# Memory Information

In [274]:
import psutil
def get_size(bytes, suffix="B"):
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
print("="*40, "Memory Information", "="*40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}") ; print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}") ; print(f"Percentage: {svmem.percent}%")
# keras-team /
# keras-contrib 

Total: 25.51GB
Available: 24.36GB
Used: 1.63GB
Percentage: 4.5%


# GPU Information

In [275]:
! nvidia-smi

Thu Oct 22 19:42:39 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [276]:
!pip install indic-nlp-library
!pip install scikit-multilearn



In [277]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

fatal: destination path 'indic_nlp_resources' already exists and is not an empty directory.


In [278]:
import sys
from indicnlp import common

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"indic_nlp_resources"

# Add library to Python path
sys.path.append(r'{}\src'.format(INDIC_NLP_LIB_HOME))

# Set environment variable for resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [279]:
import numpy as np
import pandas as pd
import csv
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [280]:
train_data = pd.read_csv('train.csv', header=0, index_col=0)
val_data = pd.read_csv('val.csv', header=0, index_col=0)

In [281]:
print(train_data.shape)
train_data.head()

(5727, 10)


Unnamed: 0_level_0,Post,Labels Set,emails,urls,mentions,hashtags,emojis,emoticons,reserved_words,Filtered_Post
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...,"hate,offensive",[],[],[],[],"['🙏', '🙏']",[],[],मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...
2,सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...,non-hostile,[],['https://t.co/8iy2MJSBAs'],[],[],[],[],[],सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...
3,"सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...",non-hostile,[],"['https://t.co/G945HvzM0Z', 'https://t.co/KfH7...",[],['LIVE'],[],[],[],"सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील..."
4,@prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...,"defamation,offensive",[],[],['@prabhav218'],[],[],[],[],साले जेएनयू छाप कमिने लोग हिन्दुओं को यह कहते...
5,#unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...,non-hostile,[],['https://t.co/4e6lysg0VR'],[],['unlock4guidelines'],[],[],[],- अनलॉक-4 के लिए गाइडलाइन्स जारी\n\n- 7 सितंब...


In [282]:
print(val_data.shape)
val_data.head()

(811, 10)


Unnamed: 0_level_0,Post,Labels Set,emails,urls,mentions,hashtags,emojis,emoticons,reserved_words,Filtered_Post
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,दृढ़ इच्छा शक्ति से परिपूर्ण प्रणबदा के लिए दे...,non-hostile,[],[],[],[],[],[],[],दृढ़ इच्छा शक्ति से परिपूर्ण प्रणबदा के लिए दे...
2,भारतीय जनता पार्टी rss वाले इतने गिरे हुए हैं ...,defamation,[],[],[],[],[],[],[],भारतीय जनता पार्टी rss वाले इतने गिरे हुए हैं ...
3,कोरोना से निपटने की तैयारी / दिल्ली में 10 हजा...,non-hostile,[],['https://t.co/9rlQowAsFh'],"['@ArvindKejriwal', '@rajnathsingh', '@AmitSha...",['Delhi'],[],[],[],कोरोना से निपटने की तैयारी / दिल्ली में 10 हजा...
4,गवर्नर कॉन्फ्रेंस में PM मोदी बोले- शिक्षा नीत...,non-hostile,[],['https://t.co/ZvKgxk6dbd'],[],[],[],[],[],गवर्नर कॉन्फ्रेंस में PM मोदी बोले- शिक्षा नीत...
5,"यूपी: गाजीपुर में Toilet घोटाला, प्रधान व सचिव...",non-hostile,[],['https://t.co/hxM1uNNmX2'],[],['UP'],[],[],[],"यूपी: गाजीपुर में Toilet घोटाला, प्रधान व सचिव..."


In [283]:
import ast

labels_set = {'defamation',
 'fake',
 'hate',
 'non-hostile',
 'offensive'}


##**Using one hot vectors of Emojis, Mentions and Hashtags**

In [284]:
# hashtags_set = defaultdict(int)

# ct=1
# for index, row in train_data.iterrows():
#   ht_list = ast.literal_eval(row['hashtags']) 
#   for ht in ht_list:
#     if hashtags_set.get(ht, 0)==0:
#       hashtags_set[ht] = ct
#       ct+=1

# emojis_set = defaultdict(int)
# ct=1
# for index, row in train_data.iterrows():
#   em_list = ast.literal_eval(row['emojis']) 
#   for em in em_list:
#     if emojis_set.get(em, 0)==0:
#       emojis_set[em] = ct
#       ct+=1

# mentions_set = defaultdict(int)
# ct=1
# for index, row in train_data.iterrows():
#   mn_list = ast.literal_eval(row['mentions']) 
#   for mn in mn_list:
#     if mentions_set.get(mn, 0)==0:
#       mentions_set[mn] = ct
#       ct+=1


##**Using per class counts for Emojis, Mentions and Hashtags**

In [285]:
hashtags_set = defaultdict(int)
emojis_set = defaultdict(int)
mentions_set = defaultdict(int)

for index, row in train_data.iterrows():
  ht_list = ast.literal_eval(row['hashtags'])
  em_list = ast.literal_eval(row['emojis'])
  mn_list = ast.literal_eval(row['mentions']) 

  for label in [x.strip() for x in row['Labels Set'].split(',')]:
    for ht in ht_list:
      hashtags_set[ht, label] +=1
    for em in em_list:
      emojis_set[em, label]+=1
    for mn in mn_list:
      mentions_set[mn, label]+=1

In [286]:
# print(len(emojis_set))
# print(len(hashtags_set))
# print(len(mentions_set))

In [287]:
# Creating Frequency Dictionary

vocab = defaultdict(int)
freqs = defaultdict(int)

for index, row in train_data.iterrows():
  for word in indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi'):
    vocab[word]+=1
    for label in [x.strip() for x in row['Labels Set'].split(',')]:
      freqs[word, label] +=1

len(freqs.keys())

35502

##**Using five class frequencies and other features as one hot vectors**

In [288]:
# def generate_train_matrix(train_data):
#   train_X = np.empty((0, 3350))

#   for index, row in train_data.iterrows():
#     x = np.zeros((1, 6))
#     x[0, 0] = 1  # Bias Unit
#     word_l = indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi')
    
#     for word in word_l:
#         x[0,1] += freqs.get((word,'defamation'),0)
#         x[0,2] += freqs.get((word, 'fake'),0)
#         x[0,3] += freqs.get((word,'hate'),0)
#         x[0,4] += freqs.get((word,'non-hostile'),0)
#         x[0,5] += freqs.get((word,'offensive'),0)

#     ht_list = ast.literal_eval(row['hashtags'])
#     hash_x = np.zeros((1, 1905))
#     for ht in ht_list:
#       idx = hashtags_set.get(ht, 0)
#       if idx!=0:
#         hash_x[0, idx-1] = 1

#     em_list = ast.literal_eval(row['emojis'])
#     emojis_x = np.zeros((1, 162))
#     for em in em_list:
#       idx = emojis_set.get(em, 0)
#       if idx!=0:
#         emojis_x[0, idx-1] = 1

#     mn_list = ast.literal_eval(row['mentions'])
#     mentions_x = np.zeros((1, 1277)) 
#     for mn in mn_list:
#         idx = mentions_set.get(mn, 0)
#         if idx!=0:
#           mentions_x[0, idx-1] = 1

#     x = np.hstack((x, hash_x, emojis_x, mentions_x))
    
#     train_X = np.vstack((train_X, x))

#   return train_X

##**Using only 5 class frequencies added up**

In [289]:
def generate_train_matrix(train_data):
  train_X = np.empty((0, 6))

  for index, row in train_data.iterrows():
    x = np.zeros((1, 6))
    x[0, 0] = 1  # Bias Unit
    word_l = indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi')
    
    for word in word_l:
        x[0,1] += freqs.get((word,'defamation'),0)
        x[0,2] += freqs.get((word, 'fake'),0)
        x[0,3] += freqs.get((word,'hate'),0)
        x[0,4] += freqs.get((word,'non-hostile'),0)
        x[0,5] += freqs.get((word,'offensive'),0)

    ht_list = ast.literal_eval(row['hashtags'])
    for ht in ht_list:
      x[0,1] += hashtags_set.get((ht,'defamation'),0)
      x[0,2] += hashtags_set.get((ht, 'fake'),0)
      x[0,3] += hashtags_set.get((ht,'hate'),0)
      x[0,4] += hashtags_set.get((ht,'non-hostile'),0)
      x[0,5] += hashtags_set.get((ht,'offensive'),0)

    em_list = ast.literal_eval(row['emojis'])
    for em in em_list:
      x[0,1] += emojis_set.get((em,'defamation'),0)
      x[0,2] += emojis_set.get((em, 'fake'),0)
      x[0,3] += emojis_set.get((em,'hate'),0)
      x[0,4] += emojis_set.get((em,'non-hostile'),0)
      x[0,5] += emojis_set.get((em,'offensive'),0)

    mn_list = ast.literal_eval(row['mentions'])
    for mn in mn_list:
      x[0,1] += mentions_set.get((mn,'defamation'),0)
      x[0,2] += mentions_set.get((mn, 'fake'),0)
      x[0,3] += mentions_set.get((mn,'hate'),0)
      x[0,4] += mentions_set.get((mn,'non-hostile'),0)
      x[0,5] += mentions_set.get((mn,'offensive'),0)
  
    train_X = np.vstack((train_X, x))

  return train_X

In [290]:
train_X = generate_train_matrix(train_data)
val_X = generate_train_matrix(val_data)

In [291]:
# Label Mapping
labels_mapping = {'defamation':0,
 'fake':1,
 'hate':2,
 'non-hostile':3,
 'offensive':4}


In [292]:
print(train_X.shape)
print(val_X.shape)

(5727, 6)
(811, 6)


In [293]:
train_y = np.empty((0, 5))
for index, row in train_data.iterrows():
  y = np.zeros((1, 5))
  for label in row['Labels Set'].split(','):
    y[0, labels_mapping[label]] = 1

  train_y = np.vstack((train_y, y))


val_y = np.empty((0, 5))
for index, row in val_data.iterrows():
  y = np.zeros((1, 5))
  for label in row['Labels Set'].split(','):
    y[0, labels_mapping[label]] = 1

  val_y = np.vstack((val_y, y))




In [294]:
print(train_y.shape)
print(val_y.shape)

(5727, 5)
(811, 5)


##**Binary Relevance Model**

In [295]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

classifier = BinaryRelevance(LogisticRegression(max_iter=150))
classifier.fit(train_X, train_y)

BinaryRelevance(classifier=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1,
                                              l1_ratio=None, max_iter=150,
                                              multi_class='auto', n_jobs=None,
                                              penalty='l2', random_state=None,
                                              solver='lbfgs', tol=0.0001,
                                              verbose=0, warm_start=False),
                require_dense=[True, True])

In [296]:
predictions = classifier.predict(val_X)

In [345]:
def evaluation(y_true, y_pred):
  print("Fine Grained Accuracy = {}".format(accuracy_score(y_true, y_pred)))
  print("\n\nFine Grained Metrics\n")
  print(classification_report(y_true, y_pred))

  # y_true_coarse = np.zeros((y_true.shape[0], 2))
  # y_pred_coarse = np.zeros((y_true.shape[0], 2))

  # y_true_coarse_1 = y_true[:,3] 
  # y_true_coarse_0 = 1 - y_true_coarse[:,1]
  # y_true_coarse = np.hstack((y_true_coarse_0, y_true_coarse_1))
  
  # y_pred_coarse_1 = y_pred[:,3]
  # y_pred_coarse_0 = 1- y_pred_coarse[:,1]
  # y_pred_coarse = np.hstack((y_pred_coarse_0, y_pred_coarse_1))

  # print("Coarse Grained Accuracy = {}".format(accuracy_score(y_true_coarse, y_pred_coarse)))
  # print("\n\nCoarse Grained Metrics\n")
  # print(classification_report(y_true_coarse, y_pred_coarse))



In [346]:
evaluation(val_y, predictions)

Fine Grained Accuracy = 0.45499383477188654


Fine Grained Metrics

              precision    recall  f1-score   support

           0       0.10      0.01      0.02        77
           1       0.50      0.14      0.22       160
           2       0.36      0.11      0.17       110
           3       0.73      0.80      0.77       435
           4       0.39      0.11      0.17       103

   micro avg       0.67      0.45      0.54       885
   macro avg       0.42      0.23      0.27       885
weighted avg       0.55      0.45      0.46       885
 samples avg       0.47      0.48      0.47       885



  _warn_prf(average, modifier, msg_start, len(result))
