## Importing libraries


In [None]:
#Importing pandas
import pandas as pd

#Importing numpy
import numpy as np

# Importing different functions from sklearn for future purposes.

# Used for feature extraction from text, which is then used as input for the model.
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing the classifier model.
from sklearn.linear_model import LogisticRegression

# Pipeline is used to assemble several steps for machine learning into a single object,
# so that the flow can be automated.
from sklearn.pipeline import Pipeline

# Each input can belong to several classes at once, so a one-vs-rest wrapper around a binary classifier is needed.
from sklearn.multiclass import OneVsRestClassifier

# Importing this to calculate and print different classification metrics such as precision, recall and f1 score.
from sklearn.metrics import classification_report

# To calculate the accuracy of the model in the end.
from sklearn.metrics import accuracy_score

# To calculate the loss of the model
from sklearn.metrics import log_loss

# Since we are working with natural-language inputs, NLTK (Natural Language Toolkit)
# is an essential library for input processing.
import nltk

# Used to get stop words which can be removed from the inputs, in the data cleaning step.
from nltk.corpus import stopwords

# Using this to reduce the words in the input to their base form.
from nltk.stem import WordNetLemmatizer

# Importing word_tokenize to split the input text into individual words/tokens.
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below; NLTK reports packages that are already up to date.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords


[nltk_data] Downloading package punkt to /home/kabir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kabir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kabir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Advantages of model:
- The logistic regression model is an easily understandable algorithm, which makes it simple to work with.
- Since this model does not require extensive computational resources, it is easy and efficient to train.
- Has a wide range of applications. This model has been deployed successfully in various fields such as medicine and finance.

## Loading the data

In [None]:
# Read the three CSV files with pandas: the training set first, then the test
# comments and the separately stored test labels.
data = pd.read_csv('./Data/train/train.csv')  # training comments and their label columns
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./Data/test/test.csv", "./Data/test_labels/test_labels.csv")
)


## Preprocessing the data


In this step we pre-process the data so that we can feed it into our model:
1. We remove the stop words.
2. We lemmatize the text. (Reducing the words to their base form so that they are standardized and all variations of the same word map onto one base form.)
3. We tokenize the text. (Split the text into individual words.)

In the code, tokenization runs first, because stop-word removal and lemmatization operate on individual tokens.



In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocessing_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    preprocess_text runs once per row of the train and test sets, and neither
    object depends on the input text, so they are created a single time.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text: str) -> str:
    """Tokenize *text*, drop English stop words and lemmatize what remains.

    Args:
        text: The raw comment text.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocessing_resources()
    tokens = word_tokenize(text)
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        # NLTK's English stop-word list is lower-case, so compare
        # case-insensitively; otherwise "The", "You", ... slip through.
        if token.lower() not in stop_words
    ]
    return ' '.join(kept)

# Apply the preprocessing to the training inputs, which are in the comment_text column.
data['comment_text'] = data['comment_text'].apply(preprocess_text)

# Training inputs: the cleaned comment text.
X_train = data['comment_text']

# Training targets: one column per label.
y_train = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Apply the same preprocessing to the test inputs (also in the comment_text column).
X_test['comment_text'] = X_test['comment_text'].apply(preprocess_text)

# Keep only the text column; from here on X_test is a Series rather than a DataFrame.
X_test = X_test['comment_text']

# Test targets, restricted to the same six label columns as y_train.
y_test = y_test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]


## Cleaning the dataset

This is an essential step before we can test our model on the test inputs.

The inputs are classified into 6 categories/classes. If an input belongs to a certain class, the value in that class's column is 1; if it does not belong to that class, the value is 0.

The rows being removed here have a value of -1 in all their label columns. The provider of the dataset explains that these rows were added later, which is why they have values of -1. Since the training data set does not contain any -1 values, we decided to remove those rows.

In [None]:
# Index labels of the rows whose 'toxic' value is the -1 placeholder.
indices_to_remove = y_test.loc[y_test['toxic'].eq(-1)].index

# Drop those rows from the test inputs and from the test labels alike, so the
# two stay aligned row for row.
X_test_filtered = X_test.drop(index=indices_to_remove)
y_test_filtered = y_test.drop(index=indices_to_remove)

## Creating the pipeline

To simplify multiple steps of the model, we are creating a pipeline. The dataset given to the pipeline will first go through the following steps:

1. TF-IDF vectorizer, for feature extraction. The output will be a feature vector.
2. The output of the vectorizer will then be used to train the classifier. The approach used is OneVsRest (OvR), which fits one logistic regression model per class and so extends logistic regression, a binary classifier, to our six classes.



In [None]:
# Two-step pipeline: TF-IDF features feed a one-vs-rest wrapper around
# logistic regression.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        (
            'clf',
            OneVsRestClassifier(
                estimator=LogisticRegression(solver='sag'),
                n_jobs=1,
            ),
        ),
    ]
)

# Fit the vectorizer and the classifier on the training data.
pipeline.fit(X_train, y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='sag'),
                                     n_jobs=1))])

## Predictions

Once the model is trained we give the model our test inputs to get the predictions.


In [None]:
# Run the fitted pipeline on the filtered test comments to get the predicted labels.
y_pred = pipeline.predict(X_test_filtered)


## Analysis

Classification report is imported from the sklearn library. This was needed so we can test our model's outputs against the gold labels. The report gives us precision, recall and f1 score for each class. We also get macro and micro averages for all the metrics.

In [None]:
# Label the report rows with the class names instead of 0-5, and set
# zero_division explicitly so metrics that are undefined (e.g. samples with no
# true or predicted labels) are reported as 0 without emitting warnings.
print(classification_report(
    y_test_filtered,
    y_pred,
    target_names=list(y_test_filtered.columns),
    zero_division=0,
))


              precision    recall  f1-score   support

           0       0.66      0.69      0.68      6090
           1       0.40      0.33      0.36       367
           2       0.77      0.61      0.68      3691
           3       0.57      0.20      0.30       211
           4       0.74      0.52      0.61      3427
           5       0.68      0.24      0.35       712

   micro avg       0.69      0.59      0.64     14498
   macro avg       0.64      0.43      0.50     14498
weighted avg       0.70      0.59      0.63     14498
 samples avg       0.06      0.05      0.06     14498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Now we calculate the loss of the model.

In [None]:
# Predict the probabilities of labels from the test data
y_pred_probs = pipeline.predict_proba(X_test_filtered)

# log_loss reads a 2-D probability matrix as one multi-class distribution per
# row. The per-label probabilities of a one-vs-rest model are independent and
# do not sum to 1, so scoring the whole matrix at once does not measure
# multi-label loss (rows with no positive label would contribute nothing).
# Instead, score every label as its own binary problem and average the results.
label_losses = [
    # labels=[0, 1] keeps the call valid even if a label never occurs in the test set.
    log_loss(y_test_filtered[label], y_pred_probs[:, column], labels=[0, 1])
    for column, label in enumerate(y_test_filtered.columns)
]
loss = float(np.mean(label_losses))
print("log loss", loss)


Finally, to calculate the accuracy we will be using the accuracy_score function we have imported from sklearn.

In [None]:
# Score the predictions against the filtered test labels. For multi-label
# targets like these, accuracy_score is subset accuracy: a row counts as
# correct only when all six labels match.
accuracy_score(y_test_filtered, y_pred)


0.8979805558160617

In [None]:
sentences = ["I hate black people", "I hate jewish people", "All muslims are terrorist", "Kill yourself, bitch", "Fuck you, you cunt", "I love god", "i love indian food", "My favourite flavour of ice-cream is chocolate", "God is very kind", "Kanye West is the greatest of all times"]


# Expected labels for the hand-written sentences, one row per sentence, in the
# column order the model was trained on:
# toxic, severe_toxic, obscene, threat, insult, identity_hate.
outputs = [
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 1, 0],
    [1, 1, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
]

# Clean the sentences exactly like the training data, replacing the list's
# contents in place.
sentences[:] = [preprocess_text(sentence) for sentence in sentences]

sentence_pred = pipeline.predict(sentences)

print("accuracy", accuracy_score(outputs, sentence_pred))





accuracy 0.5
