In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [2]:
datadf = pd.read_csv('../data/modelData.csv')

### Check original data

In [3]:
datadf.shape

(31962, 3)

#### Drop Duplicates

In [4]:
datadf.drop_duplicates(inplace = True)

In [5]:
datadf.shape

(31962, 3)

In [6]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [7]:
datadf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [8]:
datadf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


### Cleaning tweets

In [9]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean Dataset

In [10]:
#create empty list
corpus = []
# loop thru our dataset 
for i in range (0, len(datadf)):
    tweet = datadf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


In [11]:
len(corpus)

31962

In [12]:
datadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [13]:
datadf['cleaned'] = np.array(corpus)

In [14]:
data = datadf.drop(columns=['id', 'tweet'])

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    31962 non-null  int64 
 1   cleaned  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [16]:
data.columns

Index(['label', 'cleaned'], dtype='object')

In [17]:
type(data)

pandas.core.frame.DataFrame

In [18]:
non_hate = data[data['label'] == 0]
non_hate.shape

(29720, 2)

In [19]:
hate = data[data['label'] == 1]
hate.shape

(2242, 2)

## Balancing Dataset Up-sampling Minority Class

https://elitedatascience.com/imbalanced-classes

In [20]:
from sklearn.utils import resample

In [21]:
#rename dfs with majority(non_hate) and nimority(hate)
data_majority = data[data['label'] == 0]
data_minority = data[data['label'] ==1]

In [22]:
#Upsample minority
data_minority_upsampled = resample (data_minority, replace=True, #sample with replacement
                                     n_samples=len(data_majority),# to match majority class
                                     random_state=42) # reproducible results 

In [23]:
#Concatanate train_minority_upsampled to train_majority
data_upsampled = pd.concat([data_minority_upsampled, data_majority])
#Display new class counts
data_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [24]:
data_upsampled.shape, data_upsampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59440 entries, 12213 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    59440 non-null  int64 
 1   cleaned  59440 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


((59440, 2), None)

# Extracting Features

### Tf-Idf

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
data_upsampled_tfidf = tfidf_vectorizer.fit_transform(data_upsampled['cleaned']).toarray()

In [26]:
data_upsampled_tfidf.shape

(59440, 1000)

# Build the models

### Cross Validating Train Set

In [27]:
from sklearn.model_selection import train_test_split
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(data_upsampled_tfidf, data_upsampled['label'], test_size = 0.2, random_state = 42)

##### Tf-Idf Features

In [28]:
X_train_tfidf.shape, X_val_tfidf.shape, y_train.shape, y_val.shape

((47552, 1000), (11888, 1000), (47552,), (11888,))

### Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB

In [30]:
nb_tfidf = GaussianNB().fit( X_train_tfidf, y_train)
prediction_tfidf = nb_tfidf.predict(X_val_tfidf)

In [31]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
print(f"F1 score : {f1_score(y_val, prediction_tfidf)}")
print(f"Training Data Score: {nb_tfidf.score(X_train_tfidf, y_train)}")
print(f"Validation Data Score: {nb_tfidf.score(X_val_tfidf, y_val)}")
print(classification_report(y_val, prediction_tfidf))

F1 score : 0.8699266503667482
Training Data Score: 0.8695112718707941
Validation Data Score: 0.8657469717362046
              precision    recall  f1-score   support

           0       0.90      0.82      0.86      6008
           1       0.84      0.91      0.87      5880

    accuracy                           0.87     11888
   macro avg       0.87      0.87      0.87     11888
weighted avg       0.87      0.87      0.87     11888



In [32]:
!pip install joblib



In [33]:
import joblib

In [34]:
filename = 'nb_tfidf_balanced.pkl'
joblib.dump(nb_tfidf, filename)

['nb_tfidf_balanced.pkl']

In [35]:
loaded_model = joblib.load(filename)

In [36]:
result = loaded_model.score(X_val_tfidf, y_val)

In [37]:
print(result)

0.8657469717362046
