In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("data/new_hate.csv")

In [4]:
new_df = df.copy()

In [5]:
df.head(4)

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N


## Exploring the dataset

In [6]:
#### We will rename first unnamed: 0 column

df.rename(columns={'Unnamed: 0': 'serial'}, inplace=True)

In [7]:
df.sample(2)

Unnamed: 0,serial,comment,label
9509,9555,"most women who wear the burka want to wear it,...",P
30574,30657,Repeat after me: all trans are F ? C K I N G ...,N


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   serial   41144 non-null  int64 
 1   comment  41144 non-null  object
 2   label    41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB


In [9]:
df.describe()

Unnamed: 0,serial
count,41144.0
mean,20703.991056
std,12580.88404
min,1.0
25%,10295.75
50%,20593.5
75%,30894.25
max,331000.0


In [10]:
df[['comment', 'label']].describe().T

Unnamed: 0,count,unique,top,freq
comment,41144,41134,Blacks are such pondlife,5
label,41144,3,N,22158


#### Looking at above pivot table we can understand that label column might be imbalance and comment column contains duplicate values. Lets find out.

In [11]:
df.isnull().sum()  ## No missing values.

serial     0
comment    0
label      0
dtype: int64

In [12]:
dup = df[df.duplicated(keep = False)]

In [14]:
for _, group in dup.groupby(list(dup.columns)):
    print(group)

#### Lets drop the duplicate data.

In [15]:
df.drop_duplicates(keep='first', inplace=True)

In [16]:
df[df.duplicated()] # Dupicate columns removed.

Unnamed: 0,serial,comment,label


In [17]:
df["label"].value_counts()

label
N    22158
P    18950
O       36
Name: count, dtype: int64

In [18]:
len(df[df.label == 'N']) / len(df[df.label == 'P'])

1.169287598944591

#### So our majority class is just 1.17 times more than minority class and it dosen't comes under the severe category. So we will leave as it is because for sentiment analysis it can distort the dataset.

In [19]:
df['label'].loc[df['label'] == 'O' ] = df[df['label'] == 'O']['comment'].str.extract('([NPO])', expand = False)

In [18]:
df.label.value_counts()

label
N    22174
P    18967
O        2
Name: count, dtype: int64

In [19]:
df.label[df['label'] == 'O']

7642     O
31213    O
Name: label, dtype: object

In [20]:
df.label.loc[7642] = df['comment'].loc[7642][-2]

In [21]:
df.label.loc[31213] = df['comment'].loc[31213][-2]

In [22]:
df.label.value_counts()

label
N    22175
P    18968
Name: count, dtype: int64

In [23]:
df.label[df.label.isnull()]

14616    NaN
Name: label, dtype: object

#### As we don't know what is the sentiment of the comment, we will drop the nan value row. It won't cost us much as its only 1 row.

In [24]:
df.dropna(inplace = True)

In [25]:
df.label[df.label.isnull()]

Series([], Name: label, dtype: object)

In [37]:
df[df.duplicated(keep = False)]

Unnamed: 0,serial,comment,label


In [30]:
for i in list(new_df[new_df['label'] == 'O']['comment'].index):
    if i != 14616:
        df['comment'].loc[i] = df['comment'].loc[i][:-3]

### null values, duplicate values and wrong values are successfully handled.

### Now data processing for NLP

- Lower case
- Tokenization
- Removing special characters
- Removing stop words and punctuation
- Stemming

In [59]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [45]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nancy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nancy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nancy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
df.head()

Unnamed: 0,serial,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N


In [41]:
df.drop(['serial'], axis = 1, inplace = True) #removing serial columm.

In [161]:
df.sample(2)

Unnamed: 0,comment,label,stem_transform,lemma_transform
8548,living with a mulatto at university sounds hor...,N,live mulatto univers sound horribl would neckrop,living mulatto university sound horrible would...
24244,Hong Kongers are struggling everywhere whether...,P,hong konger struggl everywher whether activ li...,hong kongers struggling everywhere whether act...


In [60]:
def nlp_transform(text):
    text = text.lower()
    
    text = nltk.word_tokenize(text)
    
    y = []
    
    for i in text: # removing special characters
        if i.isalnum():
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text: # stop words and helping words removal.
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
    return " ".join(y)

In [100]:
nlp_transform(df['comment'][3])

'say actual elimin heeb wish natur becam extinct'

In [102]:
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt') 

lemma = WordNetLemmatizer()

In [121]:
def nlp_transform2(text):
    text = text.lower()
    
    text = nltk.word_tokenize(text)
    
    y = []
    
    for i in text: # removing special characters
        if i.isalnum():
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text: # stop words and helping words removal.
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(lemma.lemmatize(i))
    
    return " ".join(y)

In [122]:
nlp_transform2(df['comment'][14])

'rape culture immigrant woman even see rape'

#### we will try both lemmatization and stemming in modeling

In [125]:
%%time

df['stem_transform'] = df['comment'].apply(nlp_transform)

CPU times: total: 6min 9s
Wall time: 6min 25s


In [126]:
%%time

df['lemma_transform'] = df['comment'].apply(nlp_transform2)

CPU times: total: 5min 50s
Wall time: 5min 55s


In [148]:
new2_df = df.copy()

In [152]:
new2_df.to_csv('transformed_df.csv', sep=',', index=False, encoding='utf-8')

In [163]:
df['encoded_label'] = df['label'].replace({'N':0, "P":1})

In [166]:
df['encoded_label'].value_counts()

encoded_label
0    22175
1    18968
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, ComplementNB
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score

In [None]:
i = 17500
positive_samples = df[df['encoded_label'] == 1]
negative_samples = df[df['encoded_label'] == 0]

# Perform random subsampling for each class
# You can adjust the subsample size for each class as needed
positive_subsample = positive_samples.sample(n = i, random_state = 0)
negative_subsample = negative_samples.sample(n = i, random_state = 0)

# Combine the subsamples into a single DataFrame
subsampled_df = pd.concat([positive_subsample, negative_subsample])

# Shuffle the entire DataFrame to randomize the order of samples
subsampled_df = shuffle(subsampled_df, random_state=0).reset_index(drop=True)

In [None]:
X = subsampled_df['lemma_transform']  # Your feature data (text)
y = subsampled_df['encoded_label']  # Your labels (0 for negative sentiment, 1 for positive sentiment)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle = True)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test = tfidf_vectorizer.transform(X_test).toarray()

final_model = MultinomialNB(alpha = 20)

final_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = final_model.predict(X_test)

# Calculate precision score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print(i, accuracy, fscore, precision)
print(conf)

In [None]:
def store_model_score(model_scores, model_name, score):
    model_scores[model_name] = score

subsample_size = [10000, 12500, 15000]
test_size = [0.2, 0.3]
random_state = [0, 2, 21, 42]
max_features = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 15000]
model_scores = {}

In [None]:
for i in subsample_size:
    for j in test_size:
        for k in random_state:

            positive_samples = df[df['encoded_label'] == 1]
            negative_samples = df[df['encoded_label'] == 0]

            # Perform random subsampling for each class
            # You can adjust the subsample size for each class as needed
            positive_subsample = positive_samples.sample(n = i, random_state = k)
            negative_subsample = negative_samples.sample(n = i, random_state = k)

            # Combine the subsamples into a single DataFrame
            subsampled_df = pd.concat([positive_subsample, negative_subsample])

            # Shuffle the entire DataFrame to randomize the order of samples
            subsampled_df = shuffle(subsampled_df, random_state=k).reset_index(drop=True)

            for l in max_features:

                # Create a DataFrame for each class (assuming 'Label' is your class column)

                # Sample text data and labels (replace with your data)
                X = subsampled_df['lemma_transform']  # Your feature data (text)
                y = subsampled_df['encoded_label']  # Your labels (0 for negative sentiment, 1 for positive sentiment)

                # Split the dataset into a training and testing set
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=j, random_state=k)

                # TF-IDF vectorization
                tfidf_vectorizer = TfidfVectorizer(max_features = l)

                X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
                X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

                # Define a dictionary with model names and their corresponding configurations
                models = {
                    # 'LogisticRegression': LogisticRegression(C=1, solver='saga'),
                    'MultinomialNB': MultinomialNB(alpha=10),
                    'GaussianNB': GaussianNB(),
                    'BernoulliNB': BernoulliNB(alpha=10),
                    'ComplementNB': ComplementNB(alpha=10),
                    # 'Supportvectormachine': SVC(kernel='linear', C=1.0, probability=True, decision_function_shape='ovr', random_state=42),
                    # 'XGBoost': xgb.XGBClassifier(n_jobs=-1)
                }

                # Evaluate precision for each model
                for model_name, model in models.items():
                    # Train the model
                    model.fit(X_train_tfidf, y_train)

                    # Make predictions on the test data
                    y_pred = model.predict(X_test_tfidf)

                    # Calculate precision score
                    precision = precision_score(y_test, y_pred)

                    store_model_score(model_scores, f'{model_name}_{i,j,k,l}', precision)

                print(j,k,l, precision)