In [1]:
import pandas as pd
#import full sentence
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.max_colwidth',None)

In [2]:
df=pd.read_csv('https://raw.githubusercontent.com/talhaanwarch/Offensive-Language-Detection/master/Data/olid-training-v1.0.tsv',sep='\t')
df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans what their take on this is.,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of shit to a volcano. 😂""",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illegals to move into red states,NOT,,


In [3]:
df=df[['tweet','subtask_a']]
df.columns=['tweet','label']

# **Split the Data**

In [4]:
from sklearn.model_selection import train_test_split
train_df,test_df=train_test_split(df,test_size=0.2)

#**Preprocess the Data**

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer= WordNetLemmatizer()
w_tokenizer=nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
import string
def  preprocess(df):
  df["tweet"]=df["tweet"].str.lower()
  df['tweet']=df["tweet"].str.replace('\d+','')#remove digits
  #remove @user
  df['tweet']=df['tweet'].str.replace('@user','')
  #remove url
  df["tweet"]=df["tweet"].str.replace('https[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','')

  #remove punctuations
  df["tweet"]=df["tweet"].apply(lambda x:x.translate(str.maketrans('','',string.punctuation)))

  df["tweet"]=df["tweet"].str.replace('[^\w\s]','')

  #lemmatize
  df["tweet"]=df["tweet"].apply(lambda x: [lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(x)])
  #remove Stop words
  df["tweet"]=df["tweet"].apply(lambda x: [item for item in x if item not in stop ])
  #convert list to string
  df["tweet"]=df["tweet"].apply(lambda x : " ".join(x))
  return df

In [7]:
train_df=preprocess(train_df)
test_df=preprocess(test_df)

train_df.head()

Unnamed: 0,tweet,label
12834,achichincle lamebotas,OFF
895,rosenstein’s 300 attorney ‘helped’ vet kavanaugh – sexual abuse charge url go back highschool day cant proven amp named witness denies pathetic confirmkavanaugh pjnet ccot maga kag2018 trump2020 nomoreprogressiveliberals,OFF
8255,company hire illegals commit crime sb required cover legal fee fined malfeasance hiring first place never solve problem strict law passed,NOT
4781,paying tax optional tax evasion crime human right law 1a opinion issue preposterous gilliam anti 2a sensible gun control would prevent mentally ill people like owning gun want,OFF
7604,thing ain’t disputing nation problem never however say someone likely die earlier botswana anywhere world reach that’s felt exaggerating,NOT


In [8]:
from sklearn.preprocessing import LabelEncoder
enocder=LabelEncoder()
train_df['label']=enocder.fit_transform(train_df['label'])
test_df['label']=enocder.transform(test_df['label'])

In [9]:
train_df.head()

Unnamed: 0,tweet,label
12834,achichincle lamebotas,1
895,rosenstein’s 300 attorney ‘helped’ vet kavanaugh – sexual abuse charge url go back highschool day cant proven amp named witness denies pathetic confirmkavanaugh pjnet ccot maga kag2018 trump2020 nomoreprogressiveliberals,1
8255,company hire illegals commit crime sb required cover legal fee fined malfeasance hiring first place never solve problem strict law passed,0
4781,paying tax optional tax evasion crime human right law 1a opinion issue preposterous gilliam anti 2a sensible gun control would prevent mentally ill people like owning gun want,1
7604,thing ain’t disputing nation problem never however say someone likely die earlier botswana anywhere world reach that’s felt exaggerating,0


In [10]:
train_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,7044
1,3548


#**Model Training**

create class weights

In [11]:
train_df.label

Unnamed: 0,label
12834,1
895,1
8255,0
4781,1
7604,0
3220,1
4374,1
6523,0
10306,0
734,0


In [12]:
from sklearn.utils import class_weight
import numpy as np
class_weights= class_weight.compute_class_weight('balanced',classes=np.unique(train_df.label),y=train_df.label.to_numpy())
class_weights=dict(enumerate(class_weights))
class_weights

{0: np.float64(0.7518455423055083), 1: np.float64(1.4926719278466742)}

#**Hyper-Parameter tuning**

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

def logistic_param_selection(train_df):
  C=[1,3,5,7,9,10,12,15,20,25]
  #solver=['newton-cg','lbfgs','liblinear','sag','saga']
  grid={'logisticregression__C':C}
  vectorizer=CountVectorizer()
  clf=LogisticRegression(solver='lbfgs',max_iter=500,class_weight=class_weights)

  pipeline=make_pipeline(vectorizer,clf)
  grid_search=GridSearchCV(pipeline,param_grid=grid, cv=5, scoring='f1_macro')
  grid_search.fit(train_df.tweet, train_df.label)

  return grid_search.best_score_, grid_search.best_params_

In [14]:
cv_score,tuned_parameters=logistic_param_selection(train_df)

In [15]:
print('5 fold score',cv_score)
print('tuned parameter',tuned_parameters)

5 fold score 0.7127180281441609
tuned parameter {'logisticregression__C': 1}


#**Train and Test this Model**

In [16]:
tuned_parameters['logisticregression__C']

1

In [17]:
classifier=LogisticRegression(solver='lbfgs',C=tuned_parameters['logisticregression__C'],max_iter=500,class_weight=class_weights)
vectorizer=CountVectorizer()
train_vector=vectorizer.fit_transform(train_df.tweet)
test_vector=vectorizer.transform(test_df.tweet)

In [18]:
classifier.fit(train_vector,train_df.label)
y_pred=classifier.predict(test_vector)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(test_df.label,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      1796
           1       0.60      0.58      0.59       852

    accuracy                           0.74      2648
   macro avg       0.70      0.70      0.70      2648
weighted avg       0.74      0.74      0.74      2648



#**Save the Model in the Pickle Format**

In [20]:
import pickle
import os
if not os.path.exists('models'):
  os.mkdir('models')
#save classifier
with open("models/logisticregression.pkl",'wb') as file:
  pickle.dump(classifier, file)
#save Vectorizer
with open("models/countvectorizer.pkl",'wb') as file:
  pickle.dump(vectorizer, file)
#save encoder
with open("models/labelencoder.pkl",'wb') as file:
  pickle.dump(enocder, file)

#**Inference**

load save pkl Files

In [21]:
import pickle
#Load the saved model
with open("models/logisticregression.pkl",'rb') as file:
  classifier=pickle.load(file)
#load the saved Vectrorizer
with open("models/countvectorizer.pkl",'rb') as file:
  vectorizer=pickle.load(file)
#load the saved encoder
with open("models/labelencoder.pkl",'rb') as file:
  encoder=pickle.load(file)

Preprocess

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
if not os.path.exists('nltkdata'):
  os.mkdir('nltkdata')
  nltk.download('wordnet',download_dir='nltkdata')
  nltk.download('stopwords',download_dir='nltkdata')
  nltk.download('omw-1.4',download_dir='nltkdata')
nltk.data.path.append("nltkdata")

lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')

[nltk_data] Downloading package wordnet to nltkdata...
[nltk_data] Downloading package stopwords to nltkdata...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to nltkdata...


In [23]:
import re
import string
def preprocessing(text):
  #apply preprocessing
  text=text.lower()
  text=re.sub('\d+','',text)
  text=re.sub('@user','',text)
  text=re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
          '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',text)
  text=text.translate(str.maketrans('','',string.punctuation))
  text=re.sub('[^\w\s]', '',text)

  text=[lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(text)]
  text=[item for item in text if item not in stop]
  text="  ".join(text)
  return text

#**Predict**

In [24]:
def predict_result(text):
  text=preprocessing(text)
  vect=vectorizer.transform([text])
  pred=classifier.predict(vect)
  result=encoder.inverse_transform(pred)
  return result[0]

In [25]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
text='Hello , will you shut your mouth !'
predict_result(text)

'OFF'

In [27]:
text='How are you ?'
predict_result(text)

'NOT'

#**Sys Argument**

In [36]:
%%writefile app.py
import sys
import pickle
#load the saved model

with open("models/logisticregression.pkl",'rb') as file:
  classifier=pickle.load(file)
#load the saved Vectrorizer
with open("models/countvectorizer.pkl",'rb') as file:
  vectorizer=pickle.load(file)
#load the saved encoder
with open("models/labelencoder.pkl",'rb') as file:
  encoder=pickle.load(file)

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
if not os.path.exists('nltkdata'):
  os.mkdir('nltkdata')
  nltk.download('wordnet',download_dir='nltkdata')
  nltk.download('stopwords',download_dir='nltkdata')
  nltk.download('omw-1.4',download_dir='nltkdata')

nltk.data.path.append("nltkdata")

lemmatizer=WordNetLemmatizer()
w_tokenizer=nltk.tokenize.WhitespaceTokenizer()
stop=stopwords.words('english')

import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def preprocessing(text):
    # Apply preprocessing
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'@user', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\w\s]', '', text)

    text = [lemmatizer.lemmatize(y) for y in word_tokenize(text)]
    text = [item for item in text if item not in stop]
    text = " ".join(text)
    return text

def predict_result(text):
  text=preprocessing(text)
  vect=vectorizer.transform([text])
  pred=classifier.predict(vect)
  result=encoder.inverse_transform(pred)
  return result[0]

text=sys.argv[1]
pred=predict_result(text)
print('prediction of "{}" is {}'.format(text,pred))

Overwriting app.py


In [37]:
! python app.py "How are you ?"

Traceback (most recent call last):
  File "/content/app.py", line 58, in <module>
    pred=predict_result(text)
         ^^^^^^^^^^^^^^^^^^^^
  File "/content/app.py", line 51, in predict_result
    text=preprocessing(text)
         ^^^^^^^^^^^^^^^^^^^
  File "/content/app.py", line 45, in preprocessing
    text = [lemmatizer.lemmatize(y) for y in word_tokenize(text)]
                                             ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 142, in word_tokenize
    sentences = [text] if preserve_line else sent_tokenize(text, language)
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
    tokenizer = _get_punkt_tokenizer(language)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
    

In [40]:
! python app.py "what the heck ?"

Traceback (most recent call last):
  File "/content/app.py", line 58, in <module>
    pred=predict_result(text)
         ^^^^^^^^^^^^^^^^^^^^
  File "/content/app.py", line 51, in predict_result
    text=preprocessing(text)
         ^^^^^^^^^^^^^^^^^^^
  File "/content/app.py", line 45, in preprocessing
    text = [lemmatizer.lemmatize(y) for y in word_tokenize(text)]
                                             ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 142, in word_tokenize
    sentences = [text] if preserve_line else sent_tokenize(text, language)
                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
    tokenizer = _get_punkt_tokenizer(language)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
    