In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stop_words = ['हूँ', 'हो','हूं', 'मैं','में','तू', 'है', 'हैं','अथव', 'अद', 'अध', 'अन', 'अपन', 'अभ', 'अल', 'आग', 'आद', 'आपक', 'इत', 'इतय', 'इनक', 'इनस', 'इसक', 'इसम', 'इसल', 'उनक', 'उनस', 'उसक', 'एव', 'ऐस', 'कभ', 'करत', 'करन', 'कह', 'कहत', 'गय', 'जबक', 'जर', 'जह', 'झक', 'तथ', 'तन', 'तर', 'दब', 'दर', 'दव', 'धर', 'नक', 'नस', 'नह', 'पड', 'पहल', 'बड', 'बन', 'बह', 'यत', 'यद', 'यम', 'रख', 'रत', 'रव', 'रह', 'रहत', 'लक', 'वग', 'वय', 'वर', 'वग़', 'सक', 'सकत', 'सबस', 'सभ', 'सम', 'सर', 'सस', 'हमन', 'हर', 'था', 'दें', 'थी','ले', 'लो', 'थे', 'होगा', 'होगी', 'होंगे', 'ख़ास', 'बहुत', 'बार', 'वाले', 'वाली', 'वाला', 'जब', 'जहाँ', 'जा', 'जिस', 'जिन्हें', 'जिन्हों', 'जिसे', 'जिसका', 'जिसकी','जिसके', 'जिसमें', 'जिधर', 'के', 'का', 'की', 'को', 'कि', 'इस', 'उस', 'उसे', 'उन', 'उन्हें', 'उन्हों', 'उनका', 'उनकी', 'उनके','उनसे', 'अपना', 'अपनी', 'अपने', 'आदि', 'इत्यादि', 'इन्हें', 'इन्हों', 'इनका', 'इनकी', 'इनके', 'इनसे', 'जैसा', 'जैसे','अंदर', 'अत', 'अदि', 'अप', 'अपना', 'अपनि', 'अपनी', 'अपने', 'अभि', 'अभी', 'आदि', 'आप', 'इंहिं', 'इंहें', 'इंहों', 'इतयादि', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकि', 'इसकी', 'इसके', 'इसमें', 'इसि', 'इसी', 'इसे', 'उंहिं', 'उंहें', 'उंहों', 'उन', 'उनका', 'उनकि', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसि', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'एसे', 'ऐसे', 'ओर', 'और', 'कइ', 'कई', 'कर', 'करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफि', 'काफ़ी', 'कि', 'किंहें', 'किंहों', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसि', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोइ', 'कोई', 'कोन', 'कोनसा', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जहां', 'जा', 'जिंहें', 'जिंहों', 'जितना', 'जिधर', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जेसा', 'जेसे', 'जैसा', 'जैसे' , 'तैसा', 'तैसे', 'इसलिए', 'इसके अलावा', 'फिर', 'अगर', 'कि', 'की', 'के बारे में', 'किसी तरह', 'कोई', 'कुछ', 'कुल','जितना', 'तक', 'तो', 'थी', 'थे', 'था', 'ने', 'पर', 'जा', 'जो', 'सबसे', 'संग','से', 'तक', 'साथ', 'ही', 'हुआ', 'हुई', 'हुए', 'होता', 'होती', 'ह']
# remove duplicate stop words
stop_set = set()
for word in stop_words:
  stop_set.add(word)
print("No. of stop words: ", len(stop_set))

No. of stop words:  235


In [5]:
with open('/content/drive/MyDrive/Colab Notebooks/final_stopwords.txt', 'r', encoding='utf8') as file:
    for line in file:
        word = line.strip()  # remove newline character from the end of the line
        stop_set.add(word)  # add the word to the set
print("No. of stop words: ", len(stop_set))

No. of stop words:  422


In [6]:
def remove_stopwords_hindi(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in stop_set]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def remove_stopwords_english(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in set(stopwords.words('english'))]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [7]:
# removing punctuations
def remove_punctuations(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

In [8]:
# # tokenize and check unique words
def tokenize_unique_save(col):
  unique = set()
  for cell in col:
    tokens = word_tokenize(cell)
    for token in tokens:
      unique.add(token)
  return unique

# unique_list = list()
# with open('/content/drive/MyDrive/Colab Notebooks/unique.txt', 'w') as file:
#   for word in tokenize_unique_save(df['text']):
#     file.write(str(word)+'\n')
#   file.close()

# print(tokenize_unique_save(df['text']))

In [9]:
!pip install indic_transliteration emot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting indic_transliteration
  Downloading indic_transliteration-2.3.44-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting roman
  Downloading roman-4.0-py3-none-any.whl (7.8 kB)
Collecting backports.functools-lru-cache
  Downloading backports.functools_lru_cache-1.6.4-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: emot, roman, backports.functools-lru-cache, indic_transliteration
Successfully installed backports.functools-lru-cache-1.6.4 emot-3.1 indic_transliteration-2.3.44 roman-4.0


In [10]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Define preprocessing functions
def preprocess_hindi_text(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

def transliterate_hindi(text):
    # Transliterate Hindi text to English
    english_text = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
    return english_text.lower()

import unicodedata

def extract_emojis(text):
    # Extract emojis from text
    emojis = ''.join(c for c in text if c in ''.join(chr(i) for i in range(0x1F300, 0x1F6FF+1)))
    return emojis

def extract_raw_english(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^a-z^A-Z]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

In [11]:
from emot.emo_unicode import UNICODE_EMOJI
 # Function for converting emojis into word
def convert_emojis(text):
    for e in UNICODE_EMOJI:
        text = text.replace(e, (UNICODE_EMOJI[e]+' ').replace(':',''))
    return text

convert_emojis('😍😎👸👈🎠👫👸👈👗💃👀👩😋😍👈👸👰 पगली बहोत शोकीन तुजे डोली बिठाके लेके तेरे कपड़...')

'smiling_face_with_heart-eyes smiling_face_with_sunglasses princess backhand_index_pointing_left carousel_horse woman_and_man_holding_hands princess backhand_index_pointing_left dress woman_dancing eyes woman face_savoring_food smiling_face_with_heart-eyes backhand_index_pointing_left princess person_with_veil  पगली बहोत शोकीन तुजे डोली बिठाके लेके तेरे कपड़...'

In [12]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/abusive_text.csv')
df.head()

Unnamed: 0,label,text
0,0,भीड़ में बहुत हब्सी मिलेंगे
1,0,साले बेवकूफ अपनी मां मक्खियां तो हटा दें
2,0,बुर देदो तो मुह में लंड ले लो तो
3,0,कुत्ता वहा है चिल्ला तू क्यों रहा है
4,1,चाय नहीं पीता हूं मैं इसी को छोड़ दिया ok


In [18]:
############################################################################# START PRE_PROCESSING ###################################################################################

# remove punctuation from text
df['Clean Text'] = df['text'].apply(remove_punctuations)

# convert emoji to text
df['Clean Text'] = df['Clean Text'].apply(convert_emojis)

# remove digits from text
df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
df['Clean Text'] = df['Clean Text'].apply(remove_stopwords_hindi)

df['final_text'] = df['Clean Text']

  df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')


In [19]:
traindf, valdf = train_test_split(df, train_size=0.8)

In [None]:
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import torch
import torch.nn as nn
from torchtext import data
from torchtext import datasets
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable 
from tqdm import tqdm

from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [22]:
def build_vocab(data):
  for text in data:
        yield word_tokenize(text)

In [23]:
# max sequence length
max_seq_length = 100
# build vocab
vocab = build_vocab_from_iterator(build_vocab(df['final_text']), specials=["UNK"])
# The vocabulary block converts a list of tokens into integers.
vocab.set_default_index(vocab["UNK"])

In [24]:
# encoding each sentence as a sequence of integer
encoded_train_data = []
def encode_word2int(data):
  word2int = []
  for text in data:
    tokens = word_tokenize(text)
    word2int.append([vocab[word] for word in tokens])
  return word2int

In [28]:
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hindi_test.csv')

In [29]:
# preprocess data
# remove punctuation from text
test_df['Clean Text'] = test_df['text'].apply(remove_punctuations)

# convert emoji to text
test_df['Clean Text'] = test_df['Clean Text'].apply(convert_emojis)

# remove digits from text
test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
test_df['Clean Text'] = test_df['Clean Text'].apply(remove_stopwords_hindi)

test_df['final_text'] = test_df['Clean Text']

test_df.head(10)

  test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')


Unnamed: 0,label,text,Clean Text,final_text
0,0,मैं ये नही सोच रहा की इसे निकले कैसे मैं ये सो...,नही सोच निकले सोच फंसा कैसेface_with_tears_of_...,नही सोच निकले सोच फंसा कैसेface_with_tears_of_...
1,1,और दिवाली में भी पूरा देश पड़ाका नहीं फोडात,दिवाली देश पड़ाका फोडात,दिवाली देश पड़ाका फोडात
2,1,कुत्ता बिल्ली पाल लेना मगर गलत फहमी कभी नहीं।,कुत्ता बिल्ली पाल लेना गलत फहमी नहीं।,कुत्ता बिल्ली पाल लेना गलत फहमी नहीं।
3,0,तेरी गांड में प्याज काट देगा गुज्जर भोसड़ी के,तेरी गांड प्याज काट देगा गुज्जर भोसड़ी,तेरी गांड प्याज काट देगा गुज्जर भोसड़ी
4,1,बंगाली साड़ी ऐसे नहीं पहना जाता है दीदी,बंगाली साड़ी पहना दीदी,बंगाली साड़ी पहना दीदी
5,1,ऐ इंडिया है यह आदमी दो बार जीता है एक बार मरत...,ऐ इंडिया आदमी जीता मरता,ऐ इंडिया आदमी जीता मरता
6,1,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ ...,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ सौ...,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ सौ...
7,1,एक तीर एक कमान आदिवासी एक समान एक तीर एक कमान ...,तीर कमान आदिवासी तीर कमान जय श्रीराम जय श्रीरा...,तीर कमान आदिवासी तीर कमान जय श्रीराम जय श्रीरा...
8,1,आपका बहुत बड़ा फैन हूं असद ओवैसी साहब मैं आपका...,फैन असद ओवैसी साहब फैन मुजम्मिल थाली number,फैन असद ओवैसी साहब फैन मुजम्मिल थाली number
9,0,तुम सब चूतिया हो रोटी राम,सब चूतिया रोटी राम,सब चूतिया रोटी राम


In [27]:
# Define the LSTM model architecture
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, patience):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.patience = patience
        self.counter=0
        self.early_stop=False
        self.min_delta = 5
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=0.3, num_layers = num_layers)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()
    
    def forward(self, x, h):
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, h)
        # print(lstm_out.shape)
        lstm_out = lstm_out[:, -1, :] # getting the last time step output
        lstm_out = self.dropout(lstm_out)
        # fully-connected layer
        out = self.fc(lstm_out)
        # out = self.fc2(out)
        # sigmoid function
        out = self.sig(out)
        # return last sigmoid output
        return out

    def earlystop(self, validation_loss, train_loss):
      if (validation_loss - train_loss) > self.min_delta:
        self.counter +=1
        if self.counter >= self.patience:  
            self.early_stop = True
    
    def getearlystop(self):
      return self.early_stop
    
    def getearlystopcnt(self):
      return self.counter

    def getpatience(self):
      return self.patience
    
    def incearlystopcnt(self):
      self.counter += 1
    
    def setearlystopcnt(self):
      self.counter += 0

In [30]:
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 600
num_layers = 2

epochs = 10
lr = 0.001 # learning rate

In [31]:
# LSTM
def testLSTM(testdf):
  # Load the saved LSTM model
  model_path = '/content/drive/MyDrive/Colab Notebooks/models/lstm.pth'
  model = torch.load(model_path)
  model.to(device)
  model.eval()

  total_f1 = 0
  nb_eval_steps = 0
  # encode to int
  encoded_train_data = encode_word2int(testdf['final_text'])
  labels = np.array(testdf['label'])
  # maximum sequence length
  MAX_SEQ_LEN = 100
  # Padding the sentences
  padded_X = []
  for sentence in encoded_train_data:
    if len(sentence) > MAX_SEQ_LEN:
      padded_X.append(sentence[:MAX_SEQ_LEN])
    else:
      padded_X.append([0]*(MAX_SEQ_LEN-len(sentence)) + sentence)
  padded_X = np.array(padded_X)
  test_set = TensorDataset(torch.from_numpy(padded_X), torch.from_numpy(labels))
  # create a data loader
  test_loader = DataLoader(test_set, batch_size=32, pin_memory=True,num_workers=2, shuffle=False)
  # perform evaluation loop on batches
  val_loss = []
  val_acc= []
  running_loss_val = 0
  correct_val = 0
  total_val = 0
  total_step_val = len(test_loader)
  with torch.no_grad():
    for texts, labels in tqdm(test_loader):
      texts = texts.to(device) 
      labels = labels.to(device) 
      bs = labels.shape[0]  
      zero_init = torch.zeros(num_layers,bs,hidden_dim).to(device)

      h = tuple([zero_init, zero_init]) 

      preds = model(texts, h)
      loss = nn.BCELoss()(preds.squeeze(), labels.float())
      # val_loss.append(loss.item())
      running_loss_val += loss.item()

      # y_pred_val = torch.argmax(preds, dim=1)
      preds = torch.round(preds.squeeze())
      # y_val_list.extend(preds.tolist())
      # print(preds.tolist())

      correct_val += torch.sum(preds==labels).item()
      total_val += labels.size(0)
      output_preds_cpu=preds.cpu()
      total_f1 += f1_score(labels.cpu(), preds.tolist(), average='macro')
      nb_eval_steps += 1
      # accuracy = torch.tensor(torch.sum(preds == labels).item() / len(preds)).item()        
      # accs.append(accuracy)

  val_loss.append(running_loss_val / total_step_val)
  val_acc.append(100 * correct_val / total_val)
  avg_f1 = total_f1 / nb_eval_steps
  print(f'\n\nAccuracy : {np.mean(val_acc):.3f}%')
  print(f'Macro f1-score : {avg_f1*100:.3f}%')

testLSTM(test_df)

100%|██████████| 211/211 [00:04<00:00, 51.15it/s]



Accuracy : 79.504%
Macro f1-score : 78.984%





In [32]:
def print_intersection_points(train_df, val_df, new_testdf):
  # Extract the column of interest from training and test dataframes
  train_col = set(train_df["text"])
  test_col = set(new_testdf["text"])

  # Count the number of common rows between the two dataframes
  common_rows = len(train_col.intersection(test_col))
  print("Number of common rows between train and test:", common_rows)

  # Extract the column of interest from validation and test dataframes
  valid_col = set(val_df["text"])

  # Count the number of common rows between the two dataframes
  common_rows = len(valid_col.intersection(test_col))
  print("Number of common rows between validation and test:", common_rows)

print_intersection_points(traindf, valdf, test_df)

Number of common rows between train and test: 0
Number of common rows between validation and test: 0
