### load data

In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk import TweetTokenizer
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus. One call is enough: a repeated call only
# reports that the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the labelled tweets from disk.
data = pd.read_csv('labeled_data.csv')

# Rename the first CSV column to 'id' and promote it to the index. This goes
# through the public rename API instead of assigning into
# `data.columns.values`: that array backs the Index object, which is meant to
# be immutable, so writing into it in place is fragile.
data = data.rename(columns={data.columns[0]: 'id'}).set_index('id')

# Preview the frame (as the cell's last expression so that it is displayed).
data.head()

In [None]:
# Discard, in place, the columns that the rest of the notebook does not use,
# then display the remaining frame.
data.drop(
    columns=['count', 'hate_speech', 'offensive_language', 'neither'],
    inplace=True,
)
data

Unnamed: 0_level_0,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
25291,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
25292,2,"you've gone and broke the wrong heart baby, an..."
25294,1,young buck wanna eat!!.. dat nigguh like I ain...
25295,1,youu got wild bitches tellin you lies


### preprocess the text 

In [None]:
# Every character outside this whitelist is deleted (not replaced by a space).
# '#' and '@' survive so the mention / numeric-hashtag patterns below can still
# match after the stripping step.
bad_symbols_re = re.compile('[^0-9a-z #@://]')
# NOTE: despite the `_re` suffix this is a plain set of English stop words.
stopwords_re = set(stopwords.words('english'))
twitter_username_re = re.compile(r'@([A-Za-z0-9#]+)')
# '#' followed by digits only; hashtags made of letters are left alone.
hashtag_re = re.compile(r'#([0-9]+)')
# A single URL token. The previous pattern ended in a greedy `.*`, which
# deleted the URL *and everything after it* up to the end of the line.
http_re = re.compile(r'https?://\S+')

def clean_text(text):
  """Normalise one tweet for feature extraction.

  Steps, in order: lowercase, remove URLs, delete characters outside the
  `bad_symbols_re` whitelist, remove @mentions, remove numeric '#123' tokens,
  and drop English stop words.

  Args:
    text: the raw tweet as a string.

  Returns:
    The cleaned tweet: the remaining whitespace-separated tokens joined by
    single spaces (an empty string when nothing is left).
  """
  text = text.lower()
  # URLs are removed first, while whitespace still delimits them: the symbol
  # stripping below deletes newlines and tabs, which would otherwise glue a
  # URL to the word that follows it.
  text = http_re.sub('', text)
  text = bad_symbols_re.sub('', text)
  text = twitter_username_re.sub('', text)
  text = hashtag_re.sub('', text)
  return ' '.join(word for word in text.split() if word not in stopwords_re)

In [None]:
# Run the cleaner over every tweet, overwriting the raw text column, then
# preview the first 25 rows.
data['tweet'] = data['tweet'].map(clean_text)
data.head(25)

Unnamed: 0_level_0,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,rt : woman shouldnt complain cleaning house am...
1,1,rt : boy dats coldtyga dwn bad cuffin dat hoe ...
2,1,rt dawg rt : ever fuck bitch start cry confuse...
3,1,rt : look like tranny
4,1,rt : shit hear might true might faker bitch to...
5,1,: shit blows meclaim faithful somebody still f...
6,1,: sit hate another bitch got much shit going
7,1,: cause im tired big bitches coming us skinny ...
8,1,amp might get ya bitch back amp thats
9,1,:hobbies include: fighting mariambitch


In [None]:
# Show only the rows whose `class` label equals 1.
data[data['class'] == 1]

Unnamed: 0_level_0,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,rt : boy dats coldtyga dwn bad cuffin dat hoe ...
2,1,rt dawg rt : ever fuck bitch start cry confuse...
3,1,rt : look like tranny
4,1,rt : shit hear might true might faker bitch to...
5,1,: shit blows meclaim faithful somebody still f...
...,...,...
25287,1,really care bout dis bitch dick yo feelings
25288,1,worried bout bitches need
25291,1,yous muthafin lie : right tl trash mine bible ...
25294,1,young buck wanna eat dat nigguh like aint fuck...


### feature extraction

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data["tweet"],
    data["class"],
    test_size=0.2,
    random_state=42,
)
(len(x_train), len(x_test))

(19826, 4957)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words that the vectorizer filters out.
stop_words = set(stopwords.words('english'))

# Tweet-aware tokenizer configured to lowercase tokens, shorten elongated
# character runs and strip @handles.
tok = TweetTokenizer(
    preserve_case = False,
    reduce_len = True,
    strip_handles = True,
)

def tfidf_features(X_train, X_test):
  """Turn raw tweet text into TF-IDF feature matrices.

  The vectorizer is fitted on the training texts only and then applied
  unchanged to the test texts, so no test-set statistics influence the
  vocabulary or the IDF weights.

  Args:
    X_train: iterable of training documents (strings).
    X_test: iterable of test documents (strings).

  Returns:
    A tuple `(X_train, X_test, vocabulary)`: the transformed training and test
    matrices and the fitted vectorizer's `vocabulary_` mapping.
  """
  tfidf_vectorizer = TfidfVectorizer(
      # A custom tokenizer is supplied, so the regex token pattern is never
      # used; passing None states that explicitly and avoids the "token_pattern
      # will not be used" warning.
      token_pattern = None,
      decode_error = 'ignore',
      strip_accents = 'unicode',
      tokenizer = tok.tokenize,
      # scikit-learn documents `stop_words` as 'english', a list or None;
      # newer releases reject a set, so convert (sorted for determinism).
      stop_words = sorted(stop_words),
      max_features = 5000,
      ngram_range = (1, 3),
      sublinear_tf = True)
  X_train = tfidf_vectorizer.fit_transform(X_train)
  X_test = tfidf_vectorizer.transform(X_test)

  return X_train, X_test, tfidf_vectorizer.vocabulary_

X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(x_train, x_test)

### find the best model 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE 

def find_best_model_using_gridsearchCV(X,y):
  """Grid-search three classifier families and report each one's best setting.

  Logistic regression, SVM and multinomial naive Bayes are each tuned with
  `GridSearchCV` over a 5-split `ShuffleSplit`. SMOTE oversampling runs as a
  pipeline step, so it is fitted on the training part of every split only.
  Oversampling the whole set before splitting (the previous behaviour) places
  synthetic neighbours of validation points in the training folds and inflates
  the reported scores.

  Args:
    X: feature matrix (not resampled).
    y: class labels aligned with the rows of `X`.

  Returns:
    A DataFrame with one row per algorithm and the columns 'model',
    'best_score' (mean cross-validated score of the best setting) and
    'best_params' (dict keyed by the estimator's own parameter names).
  """
  # Local import so this cell works without editing the notebook's import cell.
  # The imblearn pipeline (unlike sklearn's) accepts samplers as steps and
  # applies them during fit only.
  from imblearn.pipeline import Pipeline

  algos = {
      'logistic_regression':{
          # The default iteration cap (100) is too low for this data and
          # produces "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings.
          'model': LogisticRegression(max_iter=1000),
          'params':{
              'solver' : ['newton-cg','sag','lbfgs'],
              'C':[1,2]
          }
      },
      'svm':{
          'model': SVC(),
          'params' :{
              'C':[1,2],
              'kernel':['poly','rbf','sigmoid']
          }
      },
      'naive_bayes':{
          'model': MultinomialNB(),
          'params':{
              'fit_prior':[True,False]
          }
      }
  }

  clf_prefix = 'clf__'
  scores=[]
  cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
  for algo_name, config in algos.items():
    pipeline = Pipeline([
        ('smote', SMOTE(random_state = 2)),
        ('clf', config['model']),
    ])
    # Route every grid entry to the classifier step of the pipeline.
    param_grid = {clf_prefix + name: values for name, values in config['params'].items()}
    gs = GridSearchCV(pipeline, param_grid, cv=cv, return_train_score=False)
    gs.fit(X, y)
    scores.append({
        'model': algo_name,
        'best_score':gs.best_score_,
        # Strip the step prefix so the keys match the estimator's own names.
        'best_params':{name[len(clf_prefix):]: value for name, value in gs.best_params_.items()}
    })
  return pd.DataFrame(scores, columns=['model','best_score','best_params'])

In [None]:
# Tune all candidate models on the TF-IDF training features and show the summary table.
find_best_model_using_gridsearchCV(X=X_train_tfidf, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.914509,"{'C': 2, 'solver': 'lbfgs'}"
1,svm,0.956875,"{'C': 2, 'kernel': 'rbf'}"
2,naive_bayes,0.879783,{'fit_prior': True}


### training and evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE 
from sklearn.linear_model import LogisticRegression

# NOTE: the SVM is instantiated here but not fitted in this cell; only the
# logistic-regression model below is trained and evaluated.
svm_clf = SVC(C=2, kernel='rbf')
# max_iter is raised from the default of 100, which lbfgs exhausts on this
# data ("TOTAL NO. of ITERATIONS REACHED LIMIT").
lg_clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1000)
sm = SMOTE(random_state = 2)

# training
# Oversample the training set only; the test set keeps its original class
# balance. `fit_resample` is the current name of the former `fit_sample`,
# which no longer exists in recent imbalanced-learn releases.
X_train_res, y_train_res = sm.fit_resample(X_train_tfidf,y_train)
lg_clf.fit(X_train_res, y_train_res)
y_pred = lg_clf.predict(X_test_tfidf)

# evaluation
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

print('\nClassification Report\n')
# target_names are assigned positionally to the sorted label values, so
# 'Class 1' is the lowest label (presumably 0 - confirm against the data).
print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2', 'Class 3']))



Confusion Matrix

[[ 153  101   36]
 [ 327 3319  186]
 [  49   62  724]]

Classification Report

              precision    recall  f1-score   support

     Class 1       0.29      0.53      0.37       290
     Class 2       0.95      0.87      0.91      3832
     Class 3       0.77      0.87      0.81       835

    accuracy                           0.85      4957
   macro avg       0.67      0.75      0.70      4957
weighted avg       0.88      0.85      0.86      4957



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
