<a href="https://colab.research.google.com/github/vishalgimhan/MachineLearning-Projects/blob/main/Hate_Speech_Detection_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
#import necessary libraries
import pandas as pd
import re #for text processing

from sklearn.utils import resample #to handle imbalanced datasets

from sklearn.pipeline import Pipeline #streamline the process by chaining together multiple steps
#CountVectorizer and TfidfTransformer are used to convert text data into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier #stochastic gradient descent, suitable for large-scale and sparse data

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

In [25]:
train = pd.read_csv('/content/drive/MyDrive/Datasets/hate speech_train.csv')
print("Training Set: "% train.columns, train.shape, len(train))
test = pd.read_csv('/content/drive/MyDrive/Datasets/hate speech_test.csv')
print("Test Set: "% test.columns, test.shape, len(test))

Training Set:  (31962, 3) 31962
Test Set:  (17197, 2) 17197


In [26]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [27]:
#preprocess text data
def clean_text(df, text_field):
  df[text_field] = df[text_field].str.lower()
  #Removing Unwanted Characters:
  #Mentions(@name), Non-alphanumeric characters, URLs, Retweets (indicated by ^rt), Any text starting with http
  df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
  return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [28]:
train_clean.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation


In [29]:
train_clean['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,29720
1,2242


In [30]:
# Split the training data into majority and minority classes
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

# Upsample the minority class
train_minority_upsampled = resample(train_minority,
                                    replace=True, # sample with replacement
                                    n_samples=len(train_majority), # match number in majority class
                                    random_state=123)

# Combine majority class with upsampled minority class
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,29720
0,29720


In [31]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()), # Step 1: Convert text to token counts
    ('tfidf', TfidfTransformer()), # Step 2: Transform counts to TF-IDF
    ('nb', SGDClassifier()), # Step 3: Train a classifier using SGD
])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],
                                                    random_state=0)

In [33]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)

f1_score(y_test,y_predict)

0.9694061187762447