In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [20]:
datadf = pd.read_csv('../data/modelData.csv')

### Check original data

In [21]:
datadf.shape

(31962, 3)

#### Drop Duplicates

In [22]:
datadf.drop_duplicates(inplace = True)

In [23]:
datadf.shape

(31962, 3)

In [24]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [25]:
datadf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [26]:
datadf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


### Cleaning tweets

In [27]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean Dataset

In [28]:
#create empty list
corpus = []
# loop thru our train dataset 
for i in range (0, len(datadf)):
    tweet = datadf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


In [29]:
len(corpus)

31962

In [30]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [31]:
datadf['cleaned'] = np.array(corpus)

In [32]:
data = datadf.drop(columns=['id', 'tweet'])

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    31962 non-null  int64 
 1   cleaned  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [34]:
data.columns

Index(['label', 'cleaned'], dtype='object')

In [35]:
type(data)

pandas.core.frame.DataFrame

In [36]:
non_hate = data[data['label'] == 0]
non_hate.shape

(29720, 2)

In [37]:
hate = data[data['label'] == 1]
hate.shape

(2242, 2)

# Extracting Features

In [38]:
data.columns, data.shape

(Index(['label', 'cleaned'], dtype='object'), (31962, 2))

### 1. Bag of Words BOW

In [39]:
#train and test sets
from sklearn.feature_extraction.text import CountVectorizer

bow_cv = CountVectorizer(max_features=1000)
train_bow = bow_cv.fit_transform(data['cleaned']).toarray()

In [40]:
train_bow.shape

(31962, 1000)

# Build the models

### Cross Validating Train Set

In [41]:
from sklearn.model_selection import train_test_split
X_train_bow, X_val_bow, y_train, y_val = train_test_split(train_bow, data['label'], test_size = 0.2, random_state = 42)

##### BOW Features

In [42]:
X_train_bow.shape, X_val_bow.shape, y_train.shape, y_val.shape

((25569, 1000), (6393, 1000), (25569,), (6393,))

### Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB

### 1. BOW Features

In [44]:
nb_bow = GaussianNB().fit( X_train_bow, y_train)
prediction_bow = nb_bow.predict(X_val_bow)

In [45]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
print(f"F1 score : {f1_score(y_val, prediction_bow)}")
print(f"Training Data Score: {nb_bow.score(X_train_bow, y_train)}")
print(f"Validation Data Score: {nb_bow.score(X_val_bow, y_val)}")
print(classification_report(y_val, prediction_bow))

F1 score : 0.20965382740126767
Training Data Score: 0.4964214478470022
Validation Data Score: 0.4928828406069138
              precision    recall  f1-score   support

           0       0.99      0.46      0.63      5937
           1       0.12      0.94      0.21       456

    accuracy                           0.49      6393
   macro avg       0.55      0.70      0.42      6393
weighted avg       0.93      0.49      0.60      6393

