# Memory Information

In [1]:
import psutil
def get_size(bytes, suffix="B"):
    """Format a byte count as a human-readable string using 1024-based units.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with two decimals followed by the unit prefix and ``suffix``,
    e.g. ``1253656 -> '1.20MB'``.

    Args:
        bytes: Size to format (int or float). The name shadows the builtin
            ``bytes`` type; it is kept so keyword callers keep working.
        suffix: Text appended after the unit prefix.

    Returns:
        The formatted size. Values of 1024**6 and above are expressed in
        exa-units ("E") instead of silently returning ``None``.
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Every listed unit was exhausted: after six divisions the value is
    # expressed in 1024**6 units, i.e. exa.
    return f"{bytes:.2f}E{suffix}"
# Print a banner followed by the system-wide virtual memory statistics
# reported by psutil, with byte counts rendered through get_size().
print("="*40, "Memory Information", "="*40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")
# keras-team /
# keras-contrib 

ModuleNotFoundError: No module named 'psutil'

# GPU Information

In [None]:
# Query the GPU status through the `nvidia-smi` command-line tool.
! nvidia-smi

In [None]:
!pip install indic-nlp-library
!pip install scikit-multilearn

In [None]:
# Clone the Indic NLP resources repository into ./indic_nlp_resources,
# the directory that INDIC_NLP_RESOURCES points at in the setup cell.
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

In [None]:
import os
import sys

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"indic_nlp_resources"

# Add library to Python path. os.path.join picks the right separator for the
# current OS; a hard-coded backslash only forms a valid path on Windows.
sys.path.append(os.path.join(INDIC_NLP_LIB_HOME, 'src'))

# Import only after the path has been extended, so that a local checkout of
# the library can be found when it is not pip-installed.
from indicnlp import common

# Register the resources folder with the library
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
import numpy as np
import pandas as pd
import csv
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Load the training and validation splits. Row 0 of each file supplies the
# column names and the first column becomes the DataFrame index.
train_data = pd.read_csv(
    'train.csv',
    header=0,
    index_col=0,
)
val_data = pd.read_csv(
    'val.csv',
    header=0,
    index_col=0,
)

In [None]:
# Print the shape of the training split, then display its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Print the shape of the validation split, then display its first rows.
print(val_data.shape)
val_data.head()

In [None]:
import ast

# The five class labels used throughout this notebook.
labels_set = {'defamation', 'fake', 'hate', 'non-hostile', 'offensive'}


## **Using one-hot vectors of Emojis, Mentions and Hashtags**

In [None]:
# hashtags_set = defaultdict(int)

# ct=1
# for index, row in train_data.iterrows():
#   ht_list = ast.literal_eval(row['hashtags']) 
#   for ht in ht_list:
#     if hashtags_set.get(ht, 0)==0:
#       hashtags_set[ht] = ct
#       ct+=1

# emojis_set = defaultdict(int)
# ct=1
# for index, row in train_data.iterrows():
#   em_list = ast.literal_eval(row['emojis']) 
#   for em in em_list:
#     if emojis_set.get(em, 0)==0:
#       emojis_set[em] = ct
#       ct+=1

# mentions_set = defaultdict(int)
# ct=1
# for index, row in train_data.iterrows():
#   mn_list = ast.literal_eval(row['mentions']) 
#   for mn in mn_list:
#     if mentions_set.get(mn, 0)==0:
#       mentions_set[mn] = ct
#       ct+=1


## **Using per-class counts for Emojis, Mentions and Hashtags**

In [None]:
# Count tables keyed by (item, label): how often each hashtag / emoji /
# mention occurs in training posts that carry the given label.
hashtags_set = defaultdict(int)
emojis_set = defaultdict(int)
mentions_set = defaultdict(int)

for index, row in train_data.iterrows():
  # Each of these columns is parsed from its string form with literal_eval
  # and then iterated item by item.
  tables = (
      (hashtags_set, ast.literal_eval(row['hashtags'])),
      (emojis_set, ast.literal_eval(row['emojis'])),
      (mentions_set, ast.literal_eval(row['mentions'])),
  )

  # 'Labels Set' is a comma-separated string; surrounding spaces are dropped.
  row_labels = [name.strip() for name in row['Labels Set'].split(',')]

  for label in row_labels:
    for counts, items in tables:
      for item in items:
        counts[item, label] += 1

In [None]:
# print(len(emojis_set))
# print(len(hashtags_set))
# print(len(mentions_set))

In [None]:
# Build the token frequency dictionaries from the training posts

vocab = defaultdict(int)  # token -> number of occurrences
freqs = defaultdict(int)  # (token, label) -> occurrences in posts with that label

for index, row in train_data.iterrows():
  tokens = indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi')
  for word in tokens:
    vocab[word] += 1
    # A post may carry several comma-separated labels; credit each of them.
    for raw_label in row['Labels Set'].split(','):
      freqs[word, raw_label.strip()] += 1

# Number of distinct (token, label) pairs
len(freqs)

## **Using five class frequencies and other features as one-hot vectors**

In [None]:
# def generate_train_matrix(train_data):
#   train_X = np.empty((0, 3350))

#   for index, row in train_data.iterrows():
#     x = np.zeros((1, 6))
#     x[0, 0] = 1  # Bias Unit
#     word_l = indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi')
    
#     for word in word_l:
#         x[0,1] += freqs.get((word,'defamation'),0)
#         x[0,2] += freqs.get((word, 'fake'),0)
#         x[0,3] += freqs.get((word,'hate'),0)
#         x[0,4] += freqs.get((word,'non-hostile'),0)
#         x[0,5] += freqs.get((word,'offensive'),0)

#     ht_list = ast.literal_eval(row['hashtags'])
#     hash_x = np.zeros((1, 1905))
#     for ht in ht_list:
#       idx = hashtags_set.get(ht, 0)
#       if idx!=0:
#         hash_x[0, idx-1] = 1

#     em_list = ast.literal_eval(row['emojis'])
#     emojis_x = np.zeros((1, 162))
#     for em in em_list:
#       idx = emojis_set.get(em, 0)
#       if idx!=0:
#         emojis_x[0, idx-1] = 1

#     mn_list = ast.literal_eval(row['mentions'])
#     mentions_x = np.zeros((1, 1277)) 
#     for mn in mn_list:
#         idx = mentions_set.get(mn, 0)
#         if idx!=0:
#           mentions_x[0, idx-1] = 1

#     x = np.hstack((x, hash_x, emojis_x, mentions_x))
    
#     train_X = np.vstack((train_X, x))

#   return train_X

## **Using only 5 class frequencies added up**

In [None]:
def generate_train_matrix(train_data):
  """Turn a DataFrame of posts into an (n_rows, 6) feature matrix.

  Column 0 is a constant bias term of 1. Columns 1-5 hold, for the labels
  'defamation', 'fake', 'hate', 'non-hostile' and 'offensive' (in that
  order), the sum of the per-label counts of every token, hashtag, emoji
  and mention found in the post. The counts are looked up in the
  module-level tables ``freqs``, ``hashtags_set``, ``emojis_set`` and
  ``mentions_set``; items missing from a table contribute 0.

  Args:
    train_data: DataFrame with the columns 'Filtered_Post', 'hashtags',
      'emojis' and 'mentions'. The last three are parsed with
      ``ast.literal_eval`` and iterated. Despite the parameter name, the
      notebook also passes the validation split.

  Returns:
    A float NumPy array of shape (len(train_data), 6); shape (0, 6) for an
    empty DataFrame.
  """
  # Order of the per-class feature columns (column 0 is the bias).
  class_labels = ('defamation', 'fake', 'hate', 'non-hostile', 'offensive')
  n_features = 1 + len(class_labels)

  rows = []
  for index, row in train_data.iterrows():
    x = np.zeros(n_features)
    x[0] = 1  # Bias Unit

    # Pair each kind of item found in the post with its count table, so a
    # single loop replaces four copies of the same accumulation code.
    item_sources = (
        (indic_tokenize.trivial_tokenize(row['Filtered_Post'], lang='hi'), freqs),
        (ast.literal_eval(row['hashtags']), hashtags_set),
        (ast.literal_eval(row['emojis']), emojis_set),
        (ast.literal_eval(row['mentions']), mentions_set),
    )
    for items, counts in item_sources:
      for item in items:
        for col, label in enumerate(class_labels, start=1):
          x[col] += counts.get((item, label), 0)

    rows.append(x)

  # Stack once at the end: calling np.vstack inside the loop re-copies the
  # whole matrix for every row (quadratic in the number of rows).
  if not rows:
    return np.empty((0, n_features))
  return np.vstack(rows)

In [None]:
# Build the feature matrices. The same extractor, and therefore the same
# count tables built from train_data, is applied to both splits.
train_X = generate_train_matrix(train_data)
val_X = generate_train_matrix(val_data)

In [None]:
# Label Mapping: class name -> column index in the label indicator matrices
labels_mapping = {
    label: column
    for column, label in enumerate(
        ['defamation', 'fake', 'hate', 'non-hostile', 'offensive'])
}


In [None]:
# Sanity check: print the shapes of the training and validation feature matrices.
print(train_X.shape)
print(val_X.shape)

In [None]:
def _build_label_matrix(data):
  """Return the multi-hot label matrix for the posts in ``data``.

  Each row has one column per entry of ``labels_mapping``; a column is set
  to 1 when the corresponding label appears in the row's comma-separated
  'Labels Set' string.

  Args:
    data: DataFrame with a 'Labels Set' column of comma-separated labels.

  Returns:
    A float NumPy array of shape (len(data), len(labels_mapping)).

  Raises:
    KeyError: If a label is not a key of ``labels_mapping``.
  """
  n_labels = len(labels_mapping)
  rows = []
  for index, row in data.iterrows():
    y = np.zeros(n_labels)
    for label in row['Labels Set'].split(','):
      # Strip surrounding whitespace, as the count-building cells do, so
      # that "fake, hate" does not fail the lookup with KeyError(' hate').
      y[labels_mapping[label.strip()]] = 1
    rows.append(y)

  # Stack once instead of growing the matrix row by row.
  if not rows:
    return np.empty((0, n_labels))
  return np.vstack(rows)


train_y = _build_label_matrix(train_data)
val_y = _build_label_matrix(val_data)




In [None]:
# Sanity check: print the shapes of the training and validation label matrices.
print(train_y.shape)
print(val_y.shape)

## **Binary Relevance Model**

In [2]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Wrap a logistic regression (capped at 150 solver iterations) in a
# Binary Relevance transformer and fit it on the training matrices.
classifier = BinaryRelevance(
    classifier=LogisticRegression(max_iter=150),
)
classifier.fit(train_X, train_y)

ModuleNotFoundError: No module named 'skmultilearn'

In [None]:
# Predict labels for the validation feature matrix with the fitted classifier.
predictions = classifier.predict(val_X)

In [None]:
def evaluation(y_true, y_pred):
  """Print the fine-grained accuracy and classification report.

  Args:
    y_true: Ground-truth labels, passed unchanged to scikit-learn's
      ``accuracy_score`` and ``classification_report``.
    y_pred: Predicted labels in the same format as ``y_true``.

  Returns:
    None; the results are written to standard output.
  """
  fine_accuracy = accuracy_score(y_true, y_pred)
  print("Fine Grained Accuracy = {}".format(fine_accuracy))

  print("\n\nFine Grained Metrics\n")
  fine_report = classification_report(y_true, y_pred)
  print(fine_report)

  # y_true_coarse = np.zeros((y_true.shape[0], 2))
  # y_pred_coarse = np.zeros((y_true.shape[0], 2))

  # y_true_coarse_1 = y_true[:,3] 
  # y_true_coarse_0 = 1 - y_true_coarse[:,1]
  # y_true_coarse = np.hstack((y_true_coarse_0, y_true_coarse_1))
  
  # y_pred_coarse_1 = y_pred[:,3]
  # y_pred_coarse_0 = 1- y_pred_coarse[:,1]
  # y_pred_coarse = np.hstack((y_pred_coarse_0, y_pred_coarse_1))

  # print("Coarse Grained Accuracy = {}".format(accuracy_score(y_true_coarse, y_pred_coarse)))
  # print("\n\nCoarse Grained Metrics\n")
  # print(classification_report(y_true_coarse, y_pred_coarse))



In [None]:
# Print the accuracy and the classification report for the validation predictions.
evaluation(val_y, predictions)