# Problem Statement
```
The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to classify racist or sexist tweets from other tweets.

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset
```

# Evaluation Metric:
```
The metric used for evaluating the performance of classification model would be F1-Score.


True Positives (TP) - These are the correctly predicted positive values which means that the value of actual class is yes and the value of predicted class is also yes.

True Negatives (TN) - These are the correctly predicted negative values which means that the value of actual class is no and value of predicted class is also no.

False Positives (FP) – When actual class is no and predicted class is yes.

False Negatives (FN) – When actual class is yes but predicted class in no.

Precision = TP/TP+FP

Recall = TP/TP+FN
```

## F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
print('Shape of Train: ', train.shape)
train.head()

Shape of Train:  (31962, 3)


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
print('Shape of Test: ', test.shape)

Shape of Test:  (17197, 2)


In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.remove('not')
stop_words.remove('no')
lemm = WordNetLemmatizer()
def text_cleaning(text):
    # For special characters
    text = re.sub(r"[$&+,:;=?@#|'<>.-^*()%!]", '', text)
    
    text = re.sub(r"[^\x00-\x7F]+", '', text)
    
    text = text.split()
    
    text = [lemm.lemmatize(word) for word in text if word not in stop_words]
    
    text = ' '.join(text)
    
    return text
train['clean_text'] = train['tweet'].apply(lambda x: text_cleaning(x))
test['clean_text'] = test['tweet'].apply(lambda x: text_cleaning(x))

In [6]:
# from textblob import TextBlob
# def textblobb(text):
#     tb = TextBlob(text)

#     sentiment = tb.sentiment.polarity
#     if sentiment < 0:
#         return 1
#     else:
#         return 0
# train['sentiment'] = train['clean_text'].apply(lambda x: textblobb(x))

In [7]:
train.head()

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drag kid dys...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,3,0,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur
4,5,0,factsguide: society now #motivation,factsguide society motivation


In [8]:
train.drop(['id', 'tweet'], axis=1, inplace=True)
test.drop(['id', 'tweet'], axis=1, inplace=True)

In [12]:
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the tweets and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()
    
    #Drop the stopwords during the frequency calculation
    word_freq = word_freq.drop(stop_words, errors='ignore')
    
    #Print top 20 words
    #word_freq[:20]
    
    return word_freq
word_freq = gen_freq(train['clean_text'].str)

In [15]:
#100 most rare words in the dataset
rare_100 = word_freq[-100:]

train['word_count'] = train['clean_text'].str.split().apply(lambda x: len(x))

word_freq = gen_freq(test.clean_text.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]

test['word_count'] = test['clean_text'].str.split().apply(lambda x: len(x))

train.head()

Unnamed: 0,label,clean_text,word_count
0,0,user father dysfunctional selfish drag kid dys...,8
1,0,user user thanks lyft credit cant use cause do...,15
2,0,bihday majesty,2
3,0,model love u take u time ur,7
4,0,factsguide society motivation,3


In [16]:
#Check whether a negation term is present in the text
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0

#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

In [18]:
#Negation present or not
train['any_neg'] = train.clean_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
train['is_question'] = train.clean_text.str.split().apply(lambda x: is_question(x))

#Negation present or not
test['any_neg'] = test.clean_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
test['is_question'] = test.clean_text.str.split().apply(lambda x: is_question(x))

train.head()

Unnamed: 0,label,clean_text,word_count,any_neg,is_question
0,0,user father dysfunctional selfish drag kid dys...,8,0,0
1,0,user user thanks lyft credit cant use cause do...,15,0,0
2,0,bihday majesty,2,0,0
3,0,model love u take u time ur,7,0,0
4,0,factsguide society motivation,3,0,0


# Splitting the dataset

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train['clean_text'], train['label'], test_size=0.2, random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(stop_words='english')
tf = TfidfTransformer(norm='l2',sublinear_tf=True)

In [21]:
X_traincv = cv.fit_transform(X_train)
X_traintf = tf.fit_transform(X_traincv)

X_testcv = cv.transform(X_test)
X_testtf = tf.transform(X_testcv)

In [22]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(X_traintf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred = model.predict(X_testtf)

In [25]:
accuracy_score(y_test, y_pred)

0.9585484123259815

In [28]:
confusion_matrix(y_test, y_pred)

array([[5907,   30],
       [ 235,  221]], dtype=int64)

In [29]:
f1_score(y_test, y_pred)

0.6251768033946251

In [30]:
train_cv = cv.fit_transform(train['clean_text'])
test_cv = cv.transform(test['clean_text'])

In [31]:
train_tf = tf.fit_transform(train_cv)
test_tf = tf.transform(test_cv)

In [32]:
model.fit(train_tf, train['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
test_pred = model.predict(test_tf)

In [34]:
sub = pd.read_csv('sample_submission.csv')
sub['label'] = test_pred
sub.to_csv('sample_submission.csv', index=False)

In [35]:
# CatBoost

In [43]:
from lightgbm import LGBMClassifier
cb = LGBMClassifier()
cb.fit(X_traintf, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [44]:
y_pred = cb.predict(X_testtf)

In [45]:
f1_score(y_test, y_pred)

0.5285714285714286