In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [2]:
datadf = pd.read_csv('../data/modelData.csv')

### Check original data

In [3]:
datadf.shape,

((31962, 3),)

#### Drop Duplicates

In [4]:
datadf.drop_duplicates(inplace = True)

In [5]:
datadf.shape

(31962, 3)

In [6]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [7]:
datadf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [8]:
datadf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


### Cleaning tweets

In [9]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean Dataset

In [10]:
#create empty list
corpus = []
# loop thru our dataset 
for i in range (0, len(datadf)):
    tweet = datadf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


In [11]:
len(corpus)

31962

In [12]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [13]:
datadf['cleaned'] = np.array(corpus)

In [14]:
data = datadf.drop(columns=['id', 'tweet'])

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    31962 non-null  int64 
 1   cleaned  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [16]:
data.columns

Index(['label', 'cleaned'], dtype='object')

In [17]:
type(data)

pandas.core.frame.DataFrame

In [30]:
non_hate = data[data['label'] == 0]
non_hate.shape

(29720, 2)

In [31]:
hate = data[data['label'] == 1]
hate.shape

(2242, 2)

# Extracting Features

### 1. Bag of Words BOW

In [19]:
#train and test sets
from sklearn.feature_extraction.text import CountVectorizer

bow_cv = CountVectorizer(max_features=1000)
data_bow = bow_cv.fit_transform(data['cleaned']).toarray()

In [20]:
data_bow.shape

(31962, 1000)

### 2. Tf-Idf

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
data_tfidf = tfidf_vectorizer.fit_transform(data['cleaned']).toarray()

In [24]:
data_tfidf.shape

(31962, 1000)

# Build the models

### 1. Random Forest

In [25]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

### Cross Validating Train Set

#### BOW Features

In [26]:
from sklearn.model_selection import train_test_split
X_train_bow, X_val_bow, y_train, y_val = train_test_split(data_bow, data['label'], test_size = 0.2, random_state = 42)

In [27]:
X_train_bow.shape, X_val_bow.shape, y_train.shape, y_val.shape

((25569, 1000), (6393, 1000), (25569,), (6393,))

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(X_train_bow, y_train)
prediction = rf.predict(X_val_bow)

In [38]:
print(f"F1 score : {f1_score(y_val, prediction)}")
print(f"Training Data Score: {rf.score(X_train_bow, y_train)}")
print(f"Validation Data Score: {rf.score(X_val_bow, y_val)}")
print(classification_report(y_val, prediction))

F1 score : 0.5790055248618784
Training Data Score: 0.9954632562869099
Validation Data Score: 0.9404035664007508
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5937
           1       0.58      0.57      0.58       456

    accuracy                           0.94      6393
   macro avg       0.78      0.77      0.77      6393
weighted avg       0.94      0.94      0.94      6393



#### Tf-Idf Features

In [29]:
X_train_tfidf = data_tfidf[y_train.index]
X_val_tfidf = data_tfidf[y_val.index]

In [40]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(X_train_tfidf, y_train)
prediction_tfidf = rf.predict(X_val_tfidf)

In [41]:
print(f"F1 score : {f1_score(y_val, prediction_tfidf)}")
print(f"Training Data Score: {rf.score(X_train_tfidf, y_train)}")
print(f"Validation Data Score: {rf.score(X_val_tfidf, y_val)}")
print(classification_report(y_val, prediction_tfidf))

F1 score : 0.5975773889636609
Training Data Score: 0.9954241464273144
Validation Data Score: 0.9532300954168622
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5937
           1       0.77      0.49      0.60       456

    accuracy                           0.95      6393
   macro avg       0.87      0.74      0.79      6393
weighted avg       0.95      0.95      0.95      6393



### 2. SVM Support Vector Machine

In [42]:
from sklearn import svm

#### BOW Features

In [43]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(X_train_bow, y_train)

prediction = svc.predict_proba(X_val_bow)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
f1_score(y_val, prediction_int)

0.5520974289580514

In [44]:
print(f"F1 score : {f1_score(y_val, prediction_int)}")
print(f"Training Data Score: {svc.score(X_train_bow, y_train)}")
print(f"Validation Data Score: {svc.score(X_val_bow, y_val)}")
print(classification_report(y_val, prediction_int))

F1 score : 0.5520974289580514
Training Data Score: 0.9536939262388048
Validation Data Score: 0.9472860941654935
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5937
           1       0.72      0.45      0.55       456

    accuracy                           0.95      6393
   macro avg       0.84      0.72      0.76      6393
weighted avg       0.94      0.95      0.94      6393



#### Tf-Idf Features

In [45]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(X_train_tfidf, y_train)

prediction_smv = svc.predict_proba(X_val_tfidf)
prediction_int = prediction_smv[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
f1_score(y_val, prediction_int)

0.5396825396825397

In [46]:
print(f"F1 score : {f1_score(y_val, prediction_int)}")
print(f"Training Data Score: {svc.score(X_train_tfidf, y_train)}")
print(f"Validation Data Score: {svc.score(X_val_tfidf, y_val)}")
print(classification_report(y_val, prediction_int))

F1 score : 0.5396825396825397
Training Data Score: 0.9524424107317455
Validation Data Score: 0.9468168309088065
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5937
           1       0.68      0.45      0.54       456

    accuracy                           0.95      6393
   macro avg       0.82      0.72      0.76      6393
weighted avg       0.94      0.95      0.94      6393

