In [75]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Train, Validation & Test Datasets

In [76]:
# Read the train / validation / test CSV files, in that order, into three DataFrames.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/hostility-detection/constraint_Hindi_Train - Sheet1.csv',
        '/kaggle/input/hostility-detection/Constraint_Hindi_Valid - Sheet1.csv',
        '/kaggle/input/hostility-detection/Test Set Complete - test.csv',
    )
)

In [77]:
# Preview the first rows of the freshly loaded training frame.
df_train.head()

In [78]:
# (number of rows, number of columns) of the training frame.
df_train.shape

In [79]:
# Remove the Unique_ID column; no later cell reads it.
# Reassigning instead of using inplace=True, together with errors='ignore', keeps
# this cell idempotent: re-running it after the column is gone no longer raises
# KeyError.
df_train = df_train.drop(columns=['Unique_ID'], errors='ignore')

In [80]:
# Preview the frame again to confirm the Unique_ID column is gone.
df_train.head()

In [81]:
# Count how many posts carry each distinct Labels_Set value (the column used as the target below).
df_train['Labels_Set'].value_counts()

# Preprocessing 

In [82]:
# string.punctuation is the standard-library constant listing the ASCII punctuation
# characters; it is displayed here for reference. Non-ASCII marks are not part of it.
import string
string.punctuation

In [83]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    Only ASCII punctuation is stripped; all other characters are kept, in their
    original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [84]:
# Overwrite each post with its punctuation-stripped version, then preview the result.
df_train['Post'] = df_train['Post'].apply(remove_punctuation)
df_train.head()

In [85]:
# NOTE(review): repeats the preview already shown at the end of the preceding cell;
# nothing visible here modifies df_train in between.
df_train.head(5)

In [88]:
# NOTE(review): another preview of the same, unchanged frame (first four rows).
df_train.head(4)

In [89]:
# Separate the columns: post text as the features, raw label strings as the target.
X, y = df_train['Post'], df_train['Labels_Set']

# Encoding
Encode the target variable: the string-valued `Labels_Set` column is transformed into a numerical (binary indicator) representation that the model can work with.

In [90]:
from sklearn.preprocessing import MultiLabelBinarizer
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

In [91]:
labels = ['hate,offensive', 'non-hostile', 'defamation,offensive', 'fake',
       'hate', 'offensive', 'fake,hate', 'defamation', 'defamation,hate',
       'defamation,hate,offensive', 'defamation,fake,offensive',
       'fake,offensive', 'defamation,fake', 'defamation,fake,hate',
       'fake,hate,offensive', 'defamation,fake,hate,offensive']

In [92]:
pd.DataFrame(y, columns=labels)

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The default token_pattern (r"(?u)\b\w\w+\b") relies on \w, which does not match
# Devanagari combining signs (vowel signs, virama, nukta, ...). Hindi words were
# therefore cut at every such sign and only fragments ended up in the vocabulary.
# This pattern additionally treats the Devanagari block (except the danda marks
# U+0964 / U+0965) as word characters; for other scripts it still yields runs of
# two or more word characters, like the default.
HINDI_TOKEN_PATTERN = r"(?u)[\w\u0900-\u0963\u0966-\u097F]{2,}"
tfidf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,3),
                        token_pattern=HINDI_TOKEN_PATTERN)
# NOTE(review): the vocabulary and IDF weights are fitted on all of df_train before
# the train/test split below, so held-out rows influence the features — consider
# fitting on the training part only.
X = tfidf.fit_transform(df_train['Post'])

In [98]:
# Convert one row (index 2) of the TF-IDF matrix to a dense array to inspect its values.
X[2].toarray()

In [99]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
)

# Metrics for Multi-label classification

In [100]:
def j_score(y_true, y_pred, zero_division=1.0):
    """Mean sample-wise Jaccard similarity of two binary indicator matrices, in percent.

    For every row the score is ``|true AND pred| / |true OR pred|``; the row scores
    are averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Binary indicator matrices of the true and the predicted label sets.
    zero_division : float, default 1.0
        Score given to a row whose union is empty (no true and no predicted label).
        Such a row used to evaluate 0/0 = NaN, which turned the whole mean into NaN.
        The default treats two empty label sets as a perfect match.

    Returns
    -------
    float
        Mean Jaccard similarity in the range [0, 100].
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)
    # Pre-fill with the fallback value and divide only where the union is non-empty.
    jaccard = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean() * 100


def print_score(y_pred, clf, y_true=None):
    """Print the class name of ``clf`` and the Jaccard score of ``y_pred``.

    Parameters
    ----------
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted binary indicator matrix.
    clf : object
        Estimator whose class name is printed as the heading.
    y_true : array-like of shape (n_samples, n_labels), optional
        Ground-truth indicator matrix. When omitted, the notebook-level ``y_test``
        is used, which is what the function always read before this parameter existed.
    """
    if y_true is None:
        # Fall back to the hold-out labels defined by the train/test split cell.
        y_true = y_test
    print("Clf: ", clf.__class__.__name__)
    print('Jacard score: {}'.format(j_score(y_true, y_pred)))
    print('----')

# OneVsRest Classifier

In [102]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [103]:
# Wrap each candidate base estimator in a one-vs-rest classifier, fit it on the
# training split, predict the hold-out split and print the score. The base
# estimator (not the wrapper) is passed to print_score, so its class name is shown.
base_estimators = [LinearSVC(C=1.5, penalty = 'l1', dual=False)]
for classifier in base_estimators:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

# Model Test with Real Data

In [104]:

# A single raw post (Devanagari text containing an @-mention and a URL), wrapped in a
# list because it is handed to the vectorizer's transform() below.
# NOTE(review): lowercase `x` is easy to confuse with the feature matrix `X`.
x = ['भगवान बुद्ध के पुण्य स्थलों की दृष्टि से उत्तर प्रदेश अति समृद्ध है। आदरणीय PM श्री @narendramodi जी की अनुकंपा से भगवान बुद्ध की महापरिनिर्वाण स्थली कुशीनगर में अति शीघ्र अंतरराष्ट्रीय एयरपोर्ट क्रियाशील हो जाएगा।  पर्यटन विकास और रोजगार की दृष्टि से यह एक महत्वपूर्ण प्रयास है। https://t.co/Hovo52skR6']

In [105]:
# Apply the same punctuation stripping that the training posts went through before
# vectorizing; otherwise the sample is tokenized differently from the text the
# vocabulary was built on (for example, URLs split into different tokens).
xt = tfidf.transform([remove_punctuation(post) for post in x])

In [106]:
# Display the TF-IDF representation of the sample post.
xt

In [107]:
# Predict the indicator row for the sample with the fitted one-vs-rest classifier.
clf.predict(xt)

In [108]:
# Map the predicted indicator row back to the binarizer's classes.
multilabel.inverse_transform(clf.predict(xt))

In [110]:
# Classes learned by the binarizer; their order is the column order of `y`.
multilabel.classes_

In [111]:
from sklearn.metrics import precision_recall_fscore_support as score, precision_score, recall_score, f1_score

# Micro-averaged scores pool true/false positives and false negatives over all
# label columns. pos_label only applies to average='binary'; with average='micro'
# scikit-learn ignores it and emits a warning, so it is no longer passed.
# precision: tp / (tp + fp)
precision = precision_score(y_test, y_pred, average='micro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred, average='micro')
print('Recall: %f' % recall)
# f1: 2*tp / (2*tp + fp + fn)
f1 = f1_score(y_test, y_pred, average='micro')
print('F1 score: %f' % f1)

In [112]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

# Summary metrics for the current `y_pred` (the OneVsRest predictions at this point
# of a top-to-bottom run) against the hold-out labels.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print('Macro F1 score: %f' % macro_f1)
micro_f1 = f1_score(y_test, y_pred, average='micro')
print('Micro F1 score: %f' % micro_f1)
hamLoss = hamming_loss(y_test, y_pred)
print('Ham Loss score: %f' % hamLoss)

# Keep the scores per model so they can be compared later; this dict used to be
# created but never filled.
ModelsPerformance = {}
ModelsPerformance[clf.__class__.__name__] = {
    'macro_f1': macro_f1,
    'micro_f1': micro_f1,
    'hamming_loss': hamLoss,
}

# Random Forest Classifier

In [114]:
from sklearn.ensemble import RandomForestClassifier
# Random forest limited to depth-2 trees, with a fixed seed for repeatable results.
# NOTE(review): max_depth=2 is very shallow for a TF-IDF space of up to 10,000
# features — confirm this is intentional.
rclf = RandomForestClassifier(max_depth=2, random_state=0)

In [115]:
# Fit the forest on the training split (y_train is the 2-D label indicator matrix).
rclf.fit(X_train, y_train)

In [116]:
# Predict the hold-out split. This overwrites the `y_pred` produced by the OneVsRest
# model, so every metric cell below refers to the random forest.
y_pred = rclf.predict(X_test)

In [117]:
# Print the class name and Jaccard score for the random-forest predictions.
print_score(y_pred, rclf)

**Model Evaluation**

In [118]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

# Summary metrics for the random-forest predictions (`y_pred` was reassigned from
# `rclf` above) against the hold-out labels.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print('Macro F1 score: %f' % macro_f1)
micro_f1 = f1_score(y_test, y_pred, average='micro')
print('Micro F1 score: %f' % micro_f1)
hamLoss = hamming_loss(y_test, y_pred)
print('Ham Loss score: %f' % hamLoss)

# Record the scores next to those of the other models in the notebook-level dict.
ModelsPerformance[rclf.__class__.__name__] = {
    'macro_f1': macro_f1,
    'micro_f1': micro_f1,
    'hamming_loss': hamLoss,
}

In [119]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# `n_classes` is not defined in any cell visible here, so this cell raised NameError
# on a fresh kernel. Derive it from the label matrix: one column per class.
n_classes = y_test.shape[1]

# NOTE(review): y_pred holds hard 0/1 predictions, so the curves below are computed
# from only two distinct score values; continuous scores (predict_proba or
# decision_function) would be needed for informative precision-recall curves.

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
                                                        y_pred[:, i])
    average_precision[i] = average_precision_score(y_test[:, i], y_pred[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(),
    y_pred.ravel())
average_precision["micro"] = average_precision_score(y_test, y_pred,
                                                     average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision["micro"]))