In [None]:
import pandas as pd

# Lift pandas' display limits so that all columns, all rows and the full
# text of each cell are rendered without truncation.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


In [None]:
# Tab-separated OLID training file, read straight from GitHub.
OLID_TRAIN_URL = ('https://raw.githubusercontent.com/talhaanwarch/'
                  'Offensive-Language-Detection/master/Data/olid-training-v1.0.tsv')
df = pd.read_csv(OLID_TRAIN_URL, sep='\t')
df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans what their take on this is.,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of shit to a volcano. 😂""",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illegals to move into red states,NOT,,


In [None]:
# Keep only the tweet text and the sub-task A annotation, exposing the
# latter under the generic column name `label`.
df = df[['tweet', 'subtask_a']].rename(columns={'subtask_a': 'label'})

# split the data

In [None]:
from sklearn.model_selection import train_test_split
train_df,test_df=train_test_split(df,test_size=0.2)

# Preprocess the data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the corpora needed by the lemmatizer and the stop-word filter.
for _corpus in ('wordnet', 'stopwords'):
    nltk.download(_corpus)

# Shared text-processing helpers used by the preprocessing function.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import string
def preprocess(df):
  """Clean the ``tweet`` column of ``df`` and return the cleaned frame.

  Steps, in order: lowercase, strip digits, strip ``@user`` mentions, strip
  http(s) URLs, strip punctuation and every other non-word/non-space
  character, lemmatize each whitespace-separated token, drop English stop
  words, and re-join the tokens with single spaces.

  Args:
    df: DataFrame with a string column named ``tweet``.

  Returns:
    A new DataFrame with the same columns whose ``tweet`` column holds the
    cleaned text. The frame passed in is not modified.
  """
  # Work on a copy: callers pass slices produced by train_test_split, and
  # assigning into those triggers SettingWithCopyWarning and silently
  # mutates the caller's data.
  df = df.copy()
  tweets = df["tweet"].str.lower()

  # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
  # literal string by default, which would turn these steps into no-ops.
  tweets = tweets.str.replace(r'\d+', '', regex=True)  # remove digits
  tweets = tweets.str.replace('@user', '', regex=True)  # remove @user
  # remove url
  tweets = tweets.str.replace(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                              r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

  # remove punctuations (the translation table is built once, not per row)
  punctuation_table = str.maketrans('', '', string.punctuation)
  tweets = tweets.apply(lambda x: x.translate(punctuation_table))
  # remove whatever non-word, non-space characters are left
  tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)

  # lemmatize
  # NOTE(review): lemmatizing before stop-word removal lets tokens such as
  # "wa", "ha" and "doe" (presumably from "was", "has", "does") survive the
  # filter. The order is kept so it stays aligned with the inference-time
  # preprocessing; change both together if this is revisited.
  tweets = tweets.apply(lambda x: [lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(x)])
  # remove stop words
  tweets = tweets.apply(lambda x: [item for item in x if item not in stop])
  # convert list to string
  df["tweet"] = tweets.apply(lambda x: " ".join(x))
  return df

In [None]:
# Clean both partitions with the same function, then preview the training set.
train_df, test_df = preprocess(train_df), preprocess(test_df)

train_df.head()

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  


Unnamed: 0,tweet,label
9187,political zealotry amp tribalism new religion america ha problem contributing iturl partisanship tcot conservative conservative gop democrat progressive liberal think reason,NOT
5602,moore got elected liberal never learn unlikable trump,OFF
11520,gun safety reform exactly doe mean mean gun control,NOT
3110,lmaoo glad know one people wa meaning,NOT
3062,liberal defend islam yet know little nothing ideology middle eastern christian pay close attention muslim people,NOT


# Encode the label

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both partitions.
enocder=LabelEncoder().fit(train_df['label'])
train_df['label']=enocder.transform(train_df['label'])
test_df['label']=enocder.transform(test_df['label'])


In [None]:
# Preview the first rows of the training frame after label encoding.
train_df.head()

Unnamed: 0,tweet,label
9187,political zealotry amp tribalism new religion america ha problem contributing iturl partisanship tcot conservative conservative gop democrat progressive liberal think reason,0
5602,moore got elected liberal never learn unlikable trump,1
11520,gun safety reform exactly doe mean mean gun control,0
3110,lmaoo glad know one people wa meaning,0
3062,liberal defend islam yet know little nothing ideology middle eastern christian pay close attention muslim people,0


In [None]:
# Class balance of the encoded training labels.
train_df['label'].value_counts()

0    7041
1    3551
Name: label, dtype: int64

# Model training


## create class weights

In [None]:
# numpy must be imported here: this is the first cell that uses `np`, and
# without the import the cell raises NameError on a fresh kernel.
import numpy as np

np.unique(train_df.label)

array([0, 1])

In [None]:
from sklearn.utils import class_weight
import numpy as np
# 'balanced' weights are inversely proportional to class frequency, so the
# rarer class gets the larger weight.
label_values = np.unique(train_df.label)
balanced_weights = class_weight.compute_class_weight('balanced', classes=label_values, y=train_df.label.to_numpy())
# NOTE(review): the keys are positions in `balanced_weights`, not the label
# values themselves; they coincide here because the encoded labels are 0 and 1.
class_weights = {position: weight for position, weight in enumerate(balanced_weights)}
class_weights

{0: 0.7521658855276239, 1: 1.491410870177415}

## hyper-parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

def logistic_param_selection(train_df):
    """Grid-search the regularisation strength of a bag-of-words logistic model.

    A CountVectorizer -> LogisticRegression pipeline is scored with 5-fold
    cross-validation (macro F1) for every candidate value of ``C``. The
    classifier reads the module-level ``class_weights`` mapping.

    Args:
        train_df: frame providing the ``tweet`` texts and ``label`` targets.

    Returns:
        Tuple ``(best_score, best_params)`` taken from the fitted search.
    """
    candidate_c = [1, 3, 5, 7, 8, 9, 10, 12, 15, 20, 25]
    model = make_pipeline(
        CountVectorizer(),
        LogisticRegression(solver='lbfgs', max_iter=500, class_weight=class_weights),
    )
    search = GridSearchCV(
        model,
        param_grid={'logisticregression__C': candidate_c},
        cv=5,
        scoring='f1_macro',
    )
    search.fit(train_df.tweet, train_df.label)

    return search.best_score_, search.best_params_

In [None]:
# Run the grid search; keeps the best cross-validated score and the
# parameter dict that achieved it.
cv_score,tuned_parameters=logistic_param_selection(train_df)


In [None]:
# Report the grid-search outcome, one "caption value" line per item.
for caption, value in (('5 fold score', cv_score), ('tuned parameter', tuned_parameters)):
    print(caption, value)


5 fold score 0.7077318813930411
tuned parameter {'logisticregression__C': 1}


## train and test model

In [None]:
# Inspect the C value selected by the grid search.
tuned_parameters['logisticregression__C']

1

In [None]:
# Turn both partitions into bag-of-words count matrices; the vocabulary is
# learned from the training tweets only.
vectorizer = CountVectorizer()
train_vector = vectorizer.fit_transform(train_df.tweet)
test_vector = vectorizer.transform(test_df.tweet)

# Final classifier, configured with the C value chosen by the grid search.
best_c = tuned_parameters['logisticregression__C']
classifier = LogisticRegression(solver='lbfgs', C=best_c, max_iter=500, class_weight=class_weights)

In [None]:
# Fit on the training matrix and predict the held-out tweets; fit() returns
# the estimator itself, so the calls can be chained.
y_pred = classifier.fit(train_vector, train_df.label).predict(test_vector)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out partition.
report = classification_report(test_df.label, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1799
           1       0.61      0.58      0.60       849

    accuracy                           0.75      2648
   macro avg       0.71      0.71      0.71      2648
weighted avg       0.75      0.75      0.75      2648



# save model in pickle format

In [None]:
import pickle
import os
# Create the output directory if it is missing; exist_ok replaces the
# separate existence check and the gap between checking and creating.
os.makedirs('models', exist_ok=True)

# Persist the three fitted objects that inference needs: the classifier,
# the vectorizer (which holds the vocabulary) and the label encoder.
for _artifact_path, _artifact in (
    ("models/logisticregression.pkl", classifier),
    ("models/countvectorizer.pkl", vectorizer),
    ("models/labelencoder.pkl", enocder),
):
    with open(_artifact_path, 'wb') as file:
        pickle.dump(_artifact, file)

# Inference

## load saved pkl files

In [None]:
import pickle
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as file:
        return pickle.load(file)

# NOTE: unpickling executes code embedded in the file, so only load
# artefacts that were produced by a trusted source.
classifier = _load_pickle("models/logisticregression.pkl")  # saved model
vectorizer = _load_pickle("models/countvectorizer.pkl")     # saved vectorizer
encoder = _load_pickle("models/labelencoder.pkl")           # saved encoder

## preprocess

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
# Download the corpora into a local ./nltkdata folder on the first run only
# (the folder's existence is the "already downloaded" marker), then make
# NLTK search that folder.
if not os.path.exists('nltkdata'):
  os.mkdir('nltkdata')
  for _corpus in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_corpus,download_dir='nltkdata')

nltk.data.path.append("nltkdata")

# Text-processing helpers shared by the preprocessing function.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')


[nltk_data] Downloading package wordnet to nltkdata...
[nltk_data] Downloading package stopwords to nltkdata...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to nltkdata...


In [None]:
import re
import string
def preprocessing(text):
  """Clean one raw text string for the vectorizer.

  Steps, in order: lowercase, strip digits, strip ``@user`` mentions, strip
  http(s) URLs, strip punctuation and every other non-word/non-space
  character, lemmatize each whitespace-separated token, drop English stop
  words, and re-join the tokens with single spaces.

  Args:
    text: the raw input string.

  Returns:
    The cleaned string (may be empty if nothing survives the filters).
  """
  text=text.lower()
  # Patterns are raw strings: '\d', '\(' , '\w' and '\s' are invalid escape
  # sequences in ordinary literals and raise warnings on recent Pythons.
  text=re.sub(r'\d+', '',text)
  text=re.sub('@user', '',text)
  text=re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
              r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '',text)

  # Drop ASCII punctuation first, then whatever other non-word, non-space
  # characters remain.
  text=text.translate(str.maketrans('', '', string.punctuation))
  text=re.sub(r'[^\w\s]', '',text)

  # NOTE(review): lemmatizing before the stop-word filter can let altered
  # stop words through (e.g. "was" presumably becoming "wa"). The order is
  # kept because the saved model was trained on text cleaned this way.
  tokens=[lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(text)]
  tokens=[item for item in tokens if item not in stop]
  return " ".join(tokens)

## predict

In [None]:
def predict_result(text):
  """Classify a single raw text and return its decoded label.

  The text is cleaned with ``preprocessing``, vectorized with the loaded
  ``vectorizer``, scored by the loaded ``classifier``, and the numeric
  prediction is mapped back to its original label with ``encoder``.
  """
  cleaned=preprocessing(text)
  features=vectorizer.transform([cleaned])  # transform expects an iterable of documents
  encoded_prediction=classifier.predict(features)
  return encoder.inverse_transform(encoded_prediction)[0]


In [None]:
# Try the full inference path on a sample sentence.
text='Hello, you better to shut your mouth'
predict_result(text)

'OFF'

In [None]:
# Try the full inference path on a second sample sentence.
text='Hello, how are you'
predict_result(text)

'NOT'

# Command-line script (input text passed via sys.argv)

In [None]:
%%writefile app.py
import sys
import pickle
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as file:
        return pickle.load(file)

# NOTE: unpickling executes code embedded in the file, so only load
# artefacts that were produced by a trusted source.
classifier = _load_pickle("models/logisticregression.pkl")  # saved model
vectorizer = _load_pickle("models/countvectorizer.pkl")     # saved vectorizer
encoder = _load_pickle("models/labelencoder.pkl")           # saved encoder

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
# Download the corpora into a local ./nltkdata folder on the first run only
# (the folder's existence is the "already downloaded" marker), then make
# NLTK search that folder.
if not os.path.exists('nltkdata'):
  os.mkdir('nltkdata')
  for _corpus in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_corpus,download_dir='nltkdata')

nltk.data.path.append("nltkdata")

# Text-processing helpers shared by the preprocessing function.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stop = stopwords.words('english')

import re
import string
def preprocessing(text):
  """Clean one raw text string for the vectorizer.

  Steps, in order: lowercase, strip digits, strip ``@user`` mentions, strip
  http(s) URLs, strip punctuation and every other non-word/non-space
  character, lemmatize each whitespace-separated token, drop English stop
  words, and re-join the tokens with single spaces.

  Args:
    text: the raw input string.

  Returns:
    The cleaned string (may be empty if nothing survives the filters).
  """
  text=text.lower()
  # Patterns are raw strings: '\d', '\(' , '\w' and '\s' are invalid escape
  # sequences in ordinary literals and raise warnings on recent Pythons.
  text=re.sub(r'\d+', '',text)
  text=re.sub('@user', '',text)
  text=re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
              r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '',text)

  # Drop ASCII punctuation first, then whatever other non-word, non-space
  # characters remain.
  text=text.translate(str.maketrans('', '', string.punctuation))
  text=re.sub(r'[^\w\s]', '',text)

  # NOTE(review): lemmatizing before the stop-word filter can let altered
  # stop words through (e.g. "was" presumably becoming "wa"). The order is
  # kept because the saved model was trained on text cleaned this way.
  tokens=[lemmatizer.lemmatize(y) for y in w_tokenizer.tokenize(text)]
  tokens=[item for item in tokens if item not in stop]
  return " ".join(tokens)

def predict_result(text):
  """Classify a single raw text and return its decoded label.

  The text is cleaned with ``preprocessing``, vectorized with the loaded
  ``vectorizer``, scored by the loaded ``classifier``, and the numeric
  prediction is mapped back to its original label with ``encoder``.
  """
  cleaned=preprocessing(text)
  features=vectorizer.transform([cleaned])  # transform expects an iterable of documents
  encoded_prediction=classifier.predict(features)
  return encoder.inverse_transform(encoded_prediction)[0]

if __name__ == '__main__':
  # The guard lets the module be imported without reading sys.argv, and the
  # length check turns a missing argument into a usage message instead of
  # an IndexError traceback.
  if len(sys.argv) < 2:
    sys.exit('usage: python app.py "<text to classify>"')
  text=sys.argv[1]
  pred=predict_result(text)
  print('prediction of "{}" is {}'.format(text,pred))

Writing app.py


In [None]:
# Run the generated script, passing the text to classify as its first argument.
!python app.py "Hello, you better to shut your mouth"

prediction of "Hello, you better to shut your mouth" is OFF


In [None]:
# Archive the model pickles and the downloaded NLTK data.
# NOTE(review): the absolute /content/... paths assume the working directory
# is /content (presumably a Colab runtime) — confirm before running elsewhere.
!zip -r  models.zip /content/models
!zip -r  nltkdata.zip /content/nltkdata

  adding: content/models/ (stored 0%)
  adding: content/models/labelencoder.pkl (deflated 16%)
  adding: content/models/logisticregression.pkl (deflated 15%)
  adding: content/models/countvectorizer.pkl (deflated 48%)
  adding: content/nltkdata/ (stored 0%)
  adding: content/nltkdata/corpora/ (stored 0%)
  adding: content/nltkdata/corpora/wordnet.zip (stored 0%)
  adding: content/nltkdata/corpora/omw-1.4.zip (stored 0%)
  adding: content/nltkdata/corpora/stopwords/ (stored 0%)
  adding: content/nltkdata/corpora/stopwords/azerbaijani (deflated 48%)
  adding: content/nltkdata/corpora/stopwords/slovene (deflated 72%)
  adding: content/nltkdata/corpora/stopwords/german (deflated 61%)
  adding: content/nltkdata/corpora/stopwords/english (deflated 54%)
  adding: content/nltkdata/corpora/stopwords/greek (deflated 64%)
  adding: content/nltkdata/corpora/stopwords/nepali (deflated 71%)
  adding: content/nltkdata/corpora/stopwords/chinese (deflated 51%)
  adding: content/nltkdata/corpora/stopwor