In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stop_words = ['‡§π‡•Ç‡§Å', '‡§π‡•ã','‡§π‡•Ç‡§Ç', '‡§Æ‡•à‡§Ç','‡§Æ‡•á‡§Ç','‡§§‡•Ç', '‡§π‡•à', '‡§π‡•à‡§Ç','‡§Ö‡§•‡§µ', '‡§Ö‡§¶', '‡§Ö‡§ß', '‡§Ö‡§®', '‡§Ö‡§™‡§®', '‡§Ö‡§≠', '‡§Ö‡§≤', '‡§Ü‡§ó', '‡§Ü‡§¶', '‡§Ü‡§™‡§ï', '‡§á‡§§', '‡§á‡§§‡§Ø', '‡§á‡§®‡§ï', '‡§á‡§®‡§∏', '‡§á‡§∏‡§ï', '‡§á‡§∏‡§Æ', '‡§á‡§∏‡§≤', '‡§â‡§®‡§ï', '‡§â‡§®‡§∏', '‡§â‡§∏‡§ï', '‡§è‡§µ', '‡§ê‡§∏', '‡§ï‡§≠', '‡§ï‡§∞‡§§', '‡§ï‡§∞‡§®', '‡§ï‡§π', '‡§ï‡§π‡§§', '‡§ó‡§Ø', '‡§ú‡§¨‡§ï', '‡§ú‡§∞', '‡§ú‡§π', '‡§ù‡§ï', '‡§§‡§•', '‡§§‡§®', '‡§§‡§∞', '‡§¶‡§¨', '‡§¶‡§∞', '‡§¶‡§µ', '‡§ß‡§∞', '‡§®‡§ï', '‡§®‡§∏', '‡§®‡§π', '‡§™‡§°', '‡§™‡§π‡§≤', '‡§¨‡§°', '‡§¨‡§®', '‡§¨‡§π', '‡§Ø‡§§', '‡§Ø‡§¶', '‡§Ø‡§Æ', '‡§∞‡§ñ', '‡§∞‡§§', '‡§∞‡§µ', '‡§∞‡§π', '‡§∞‡§π‡§§', '‡§≤‡§ï', '‡§µ‡§ó', '‡§µ‡§Ø', '‡§µ‡§∞', '‡§µ‡•ö', '‡§∏‡§ï', '‡§∏‡§ï‡§§', '‡§∏‡§¨‡§∏', '‡§∏‡§≠', '‡§∏‡§Æ', '‡§∏‡§∞', '‡§∏‡§∏', '‡§π‡§Æ‡§®', '‡§π‡§∞', '‡§•‡§æ', '‡§¶‡•á‡§Ç', '‡§•‡•Ä','‡§≤‡•á', '‡§≤‡•ã', '‡§•‡•á', '‡§π‡•ã‡§ó‡§æ', '‡§π‡•ã‡§ó‡•Ä', '‡§π‡•ã‡§Ç‡§ó‡•á', '‡§ñ‡§º‡§æ‡§∏', '‡§¨‡§π‡•Å‡§§', '‡§¨‡§æ‡§∞', '‡§µ‡§æ‡§≤‡•á', '‡§µ‡§æ‡§≤‡•Ä', '‡§µ‡§æ‡§≤‡§æ', '‡§ú‡§¨', '‡§ú‡§π‡§æ‡§Å', '‡§ú‡§æ', '‡§ú‡§ø‡§∏', '‡§ú‡§ø‡§®‡•ç‡§π‡•á‡§Ç', '‡§ú‡§ø‡§®‡•ç‡§π‡•ã‡§Ç', '‡§ú‡§ø‡§∏‡•á', '‡§ú‡§ø‡§∏‡§ï‡§æ', '‡§ú‡§ø‡§∏‡§ï‡•Ä','‡§ú‡§ø‡§∏‡§ï‡•á', '‡§ú‡§ø‡§∏‡§Æ‡•á‡§Ç', '‡§ú‡§ø‡§ß‡§∞', '‡§ï‡•á', '‡§ï‡§æ', '‡§ï‡•Ä', '‡§ï‡•ã', '‡§ï‡§ø', '‡§á‡§∏', '‡§â‡§∏', '‡§â‡§∏‡•á', '‡§â‡§®', '‡§â‡§®‡•ç‡§π‡•á‡§Ç', '‡§â‡§®‡•ç‡§π‡•ã‡§Ç', '‡§â‡§®‡§ï‡§æ', '‡§â‡§®‡§ï‡•Ä', '‡§â‡§®‡§ï‡•á','‡§â‡§®‡§∏‡•á', '‡§Ö‡§™‡§®‡§æ', '‡§Ö‡§™‡§®‡•Ä', '‡§Ö‡§™‡§®‡•á', '‡§Ü‡§¶‡§ø', '‡§á‡§§‡•ç‡§Ø‡§æ‡§¶‡§ø', '‡§á‡§®‡•ç‡§π‡•á‡§Ç', '‡§á‡§®‡•ç‡§π‡•ã‡§Ç', '‡§á‡§®‡§ï‡§æ', '‡§á‡§®‡§ï‡•Ä', '‡§á‡§®‡§ï‡•á', '‡§á‡§®‡§∏‡•á', '‡§ú‡•à‡§∏‡§æ', '‡§ú‡•à‡§∏‡•á','‡§Ö‡§Ç‡§¶‡§∞', '‡§Ö‡§§', '‡§Ö‡§¶‡§ø', '‡§Ö‡§™', '‡§Ö‡§™‡§®‡§æ', '‡§Ö‡§™‡§®‡§ø', '‡§Ö‡§™‡§®‡•Ä', '‡§Ö‡§™‡§®‡•á', '‡§Ö‡§≠‡§ø', '‡§Ö‡§≠‡•Ä', '‡§Ü‡§¶‡§ø', '‡§Ü‡§™', '‡§á‡§Ç‡§π‡§ø‡§Ç', '‡§á‡§Ç‡§π‡•á‡§Ç', '‡§á‡§Ç‡§π‡•ã‡§Ç', '‡§á‡§§‡§Ø‡§æ‡§¶‡§ø', '‡§á‡§§‡•ç‡§Ø‡§æ‡§¶‡§ø', '‡§á‡§®', '‡§á‡§®‡§ï‡§æ', '‡§á‡§®‡•ç‡§π‡•Ä‡§Ç', '‡§á‡§®‡•ç‡§π‡•á‡§Ç', '‡§á‡§®‡•ç‡§π‡•ã‡§Ç', '‡§á‡§∏', '‡§á‡§∏‡§ï‡§æ', '‡§á‡§∏‡§ï‡§ø', '‡§á‡§∏‡§ï‡•Ä', '‡§á‡§∏‡§ï‡•á', '‡§á‡§∏‡§Æ‡•á‡§Ç', '‡§á‡§∏‡§ø', '‡§á‡§∏‡•Ä', '‡§á‡§∏‡•á', '‡§â‡§Ç‡§π‡§ø‡§Ç', '‡§â‡§Ç‡§π‡•á‡§Ç', '‡§â‡§Ç‡§π‡•ã‡§Ç', '‡§â‡§®', '‡§â‡§®‡§ï‡§æ', '‡§â‡§®‡§ï‡§ø', '‡§â‡§®‡§ï‡•Ä', '‡§â‡§®‡§ï‡•á', '‡§â‡§®‡§ï‡•ã', '‡§â‡§®‡•ç‡§π‡•Ä‡§Ç', '‡§â‡§®‡•ç‡§π‡•á‡§Ç', '‡§â‡§®‡•ç‡§π‡•ã‡§Ç', '‡§â‡§∏', '‡§â‡§∏‡§ï‡•á', '‡§â‡§∏‡§ø', '‡§â‡§∏‡•Ä', '‡§â‡§∏‡•á', '‡§è‡§ï', '‡§è‡§µ‡§Ç', '‡§è‡§∏', '‡§è‡§∏‡•á', '‡§ê‡§∏‡•á', '‡§ì‡§∞', '‡§î‡§∞', '‡§ï‡§á', '‡§ï‡§à', '‡§ï‡§∞', '‡§ï‡§∞‡§§‡§æ', '‡§ï‡§∞‡§§‡•á', '‡§ï‡§∞‡§®‡§æ', '‡§ï‡§∞‡§®‡•á', '‡§ï‡§∞‡•á‡§Ç', '‡§ï‡§π‡§§‡•á', '‡§ï‡§π‡§æ', '‡§ï‡§æ', '‡§ï‡§æ‡§´‡§ø', '‡§ï‡§æ‡•û‡•Ä', '‡§ï‡§ø', '‡§ï‡§ø‡§Ç‡§π‡•á‡§Ç', '‡§ï‡§ø‡§Ç‡§π‡•ã‡§Ç', '‡§ï‡§ø‡§§‡§®‡§æ', '‡§ï‡§ø‡§®‡•ç‡§π‡•á‡§Ç', '‡§ï‡§ø‡§®‡•ç‡§π‡•ã‡§Ç', '‡§ï‡§ø‡§Ø‡§æ', '‡§ï‡§ø‡§∞', '‡§ï‡§ø‡§∏', '‡§ï‡§ø‡§∏‡§ø', '‡§ï‡§ø‡§∏‡•Ä', '‡§ï‡§ø‡§∏‡•á', '‡§ï‡•Ä', '‡§ï‡•Å‡§õ', '‡§ï‡•Å‡§≤', '‡§ï‡•á', '‡§ï‡•ã', '‡§ï‡•ã‡§á', '‡§ï‡•ã‡§à', '‡§ï‡•ã‡§®', '‡§ï‡•ã‡§®‡§∏‡§æ', '‡§ï‡•å‡§®', '‡§ï‡•å‡§®‡§∏‡§æ', '‡§ó‡§Ø‡§æ', '‡§ò‡§∞', '‡§ú‡§¨', '‡§ú‡§π‡§æ‡§Å', '‡§ú‡§π‡§æ‡§Ç', '‡§ú‡§æ', '‡§ú‡§ø‡§Ç‡§π‡•á‡§Ç', '‡§ú‡§ø‡§Ç‡§π‡•ã‡§Ç', '‡§ú‡§ø‡§§‡§®‡§æ', '‡§ú‡§ø‡§ß‡§∞', '‡§ú‡§ø‡§®', '‡§ú‡§ø‡§®‡•ç‡§π‡•á‡§Ç', '‡§ú‡§ø‡§®‡•ç‡§π‡•ã‡§Ç', '‡§ú‡§ø‡§∏', '‡§ú‡§ø‡§∏‡•á', '‡§ú‡•Ä‡§ß‡§∞', '‡§ú‡•á‡§∏‡§æ', '‡§ú‡•á‡§∏‡•á', '‡§ú‡•à‡§∏‡§æ', '‡§ú‡•à‡§∏‡•á' , '‡§§‡•à‡§∏‡§æ', '‡§§‡•à‡§∏‡•á', '‡§á‡§∏‡§≤‡§ø‡§è', '‡§á‡§∏‡§ï‡•á ‡§Ö‡§≤‡§æ‡§µ‡§æ', '‡§´‡§ø‡§∞', '‡§Ö‡§ó‡§∞', '‡§ï‡§ø', '‡§ï‡•Ä', '‡§ï‡•á ‡§¨‡§æ‡§∞‡•á ‡§Æ‡•á‡§Ç', '‡§ï‡§ø‡§∏‡•Ä ‡§§‡§∞‡§π', '‡§ï‡•ã‡§à', '‡§ï‡•Å‡§õ', '‡§ï‡•Å‡§≤','‡§ú‡§ø‡§§‡§®‡§æ', '‡§§‡§ï', '‡§§‡•ã', '‡§•‡•Ä', '‡§•‡•á', '‡§•‡§æ', '‡§®‡•á', '‡§™‡§∞', '‡§ú‡§æ', '‡§ú‡•ã', '‡§∏‡§¨‡§∏‡•á', '‡§∏‡§Ç‡§ó','‡§∏‡•á', '‡§§‡§ï', '‡§∏‡§æ‡§•', '‡§π‡•Ä', '‡§π‡•Å‡§Ü', '‡§π‡•Å‡§à', '‡§π‡•Å‡§è', '‡§π‡•ã‡§§‡§æ', '‡§π‡•ã‡§§‡•Ä', '‡§π']
# remove duplicate stop words
stop_set = set()
for word in stop_words:
  stop_set.add(word)
print("No. of stop words: ", len(stop_set))

No. of stop words:  235


In [5]:
with open('/content/drive/MyDrive/Colab Notebooks/final_stopwords.txt', 'r', encoding='utf8') as file:
    for line in file:
        word = line.strip()  # remove newline character from the end of the line
        stop_set.add(word)  # add the word to the set
print("No. of stop words: ", len(stop_set))

No. of stop words:  422


In [6]:
def remove_stopwords_hindi(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in stop_set]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def remove_stopwords_english(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in set(stopwords.words('english'))]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [7]:
# removing punctuations
def remove_punctuations(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

In [8]:
# # tokenize and check unique words
def tokenize_unique_save(col):
  unique = set()
  for cell in col:
    tokens = word_tokenize(cell)
    for token in tokens:
      unique.add(token)
  return unique

# unique_list = list()
# with open('/content/drive/MyDrive/Colab Notebooks/unique.txt', 'w') as file:
#   for word in tokenize_unique_save(df['text']):
#     file.write(str(word)+'\n')
#   file.close()

# print(tokenize_unique_save(df['text']))

In [9]:
!pip install indic_transliteration emot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting indic_transliteration
  Downloading indic_transliteration-2.3.44-py3-none-any.whl (143 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m143.3/143.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.5/61.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting backports.functools-lru-cache
  Downloading backports.functools_lru_cache-1.6.4-py2.py3-none-any.whl (5.9 kB)
Collecting roman
  Downloading roman-4.0-py3-none-any.whl (7.8 kB)
Installing collected packages: emot, roman, backports.functools-lru-cache, indic_transliteration
Successfully installed backports.functo

In [10]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Define preprocessing functions
def preprocess_hindi_text(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

def transliterate_hindi(text):
    # Transliterate Hindi text to English
    english_text = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
    return english_text.lower()

import unicodedata

def extract_emojis(text):
    # Extract emojis from text
    emojis = ''.join(c for c in text if c in ''.join(chr(i) for i in range(0x1F300, 0x1F6FF+1)))
    return emojis

def extract_raw_english(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^a-z^A-Z]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

In [11]:
from emot.emo_unicode import UNICODE_EMOJI
 # Function for converting emojis into word
def convert_emojis(text):
    for e in UNICODE_EMOJI:
        text = text.replace(e, (UNICODE_EMOJI[e]+' ').replace(':',''))
    return text

convert_emojis('üòçüòéüë∏üëàüé†üë´üë∏üëàüëóüíÉüëÄüë©üòãüòçüëàüë∏üë∞ ‡§™‡§ó‡§≤‡•Ä ‡§¨‡§π‡•ã‡§§ ‡§∂‡•ã‡§ï‡•Ä‡§® ‡§§‡•Å‡§ú‡•á ‡§°‡•ã‡§≤‡•Ä ‡§¨‡§ø‡§†‡§æ‡§ï‡•á ‡§≤‡•á‡§ï‡•á ‡§§‡•á‡§∞‡•á ‡§ï‡§™‡•ú...')

'smiling_face_with_heart-eyes smiling_face_with_sunglasses princess backhand_index_pointing_left carousel_horse woman_and_man_holding_hands princess backhand_index_pointing_left dress woman_dancing eyes woman face_savoring_food smiling_face_with_heart-eyes backhand_index_pointing_left princess person_with_veil  ‡§™‡§ó‡§≤‡•Ä ‡§¨‡§π‡•ã‡§§ ‡§∂‡•ã‡§ï‡•Ä‡§® ‡§§‡•Å‡§ú‡•á ‡§°‡•ã‡§≤‡•Ä ‡§¨‡§ø‡§†‡§æ‡§ï‡•á ‡§≤‡•á‡§ï‡•á ‡§§‡•á‡§∞‡•á ‡§ï‡§™‡•ú...'

In [12]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/abusive_text.csv')
df.head()

Unnamed: 0,label,text
0,0,‡§≠‡•Ä‡§°‡§º ‡§Æ‡•á‡§Ç ‡§¨‡§π‡•Å‡§§ ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á
1,0,‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Ö‡§™‡§®‡•Ä ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç ‡§§‡•ã ‡§π‡§ü‡§æ ‡§¶‡•á‡§Ç
2,0,‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§§‡•ã ‡§Æ‡•Å‡§π ‡§Æ‡•á‡§Ç ‡§≤‡§Ç‡§° ‡§≤‡•á ‡§≤‡•ã ‡§§‡•ã
3,0,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§π‡•à ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ ‡§§‡•Ç ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§∞‡§π‡§æ ‡§π‡•à
4,1,‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§Æ‡•à‡§Ç ‡§á‡§∏‡•Ä ‡§ï‡•ã ‡§õ‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ ok


In [13]:
############################################################################# START PRE_PROCESSING ###################################################################################

# remove punctuation from text
df['Clean Text'] = df['text'].apply(remove_punctuations)

# convert emoji to text
df['Clean Text'] = df['Clean Text'].apply(convert_emojis)

# remove digits from text
df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
df['Clean Text'] = df['Clean Text'].apply(remove_stopwords_hindi)


df['final_text'] = df['Clean Text']

df.head(10)

  df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')


Unnamed: 0,label,text,Clean Text,final_text
0,0,‡§≠‡•Ä‡§°‡§º ‡§Æ‡•á‡§Ç ‡§¨‡§π‡•Å‡§§ ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á,‡§≠‡•Ä‡§°‡§º ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á,‡§≠‡•Ä‡§°‡§º ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á
1,0,‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Ö‡§™‡§®‡•Ä ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç ‡§§‡•ã ‡§π‡§ü‡§æ ‡§¶‡•á‡§Ç,‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç ‡§π‡§ü‡§æ,‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç ‡§π‡§ü‡§æ
2,0,‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§§‡•ã ‡§Æ‡•Å‡§π ‡§Æ‡•á‡§Ç ‡§≤‡§Ç‡§° ‡§≤‡•á ‡§≤‡•ã ‡§§‡•ã,‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§Æ‡•Å‡§π ‡§≤‡§Ç‡§°,‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§Æ‡•Å‡§π ‡§≤‡§Ç‡§°
3,0,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§π‡•à ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ ‡§§‡•Ç ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§∞‡§π‡§æ ‡§π‡•à,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ
4,1,‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§Æ‡•à‡§Ç ‡§á‡§∏‡•Ä ‡§ï‡•ã ‡§õ‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ ok,‡§ö‡§æ‡§Ø ‡§™‡•Ä‡§§‡§æ ‡§õ‡•ã‡§°‡§º ok,‡§ö‡§æ‡§Ø ‡§™‡•Ä‡§§‡§æ ‡§õ‡•ã‡§°‡§º ok
5,0,"‡§¨‡•à‡§Ç‡§ï‡•ã‡§Ç ‡§∏‡•á ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§ï‡§æ ‡§∞‡§ø‡§∂‡•ç‡§§‡§æ ‡§¶‡§≤‡§æ‡§≤‡•Ä ‡§ï‡§æ ‡§π‡•à, ‡§¨‡§°‡§º‡•á ‡§¨‡§°...",‡§¨‡•à‡§Ç‡§ï‡•ã‡§Ç ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§∞‡§ø‡§∂‡•ç‡§§‡§æ ‡§¶‡§≤‡§æ‡§≤‡•Ä ‡§ö‡•ã‡§∞‡•ã‡§Ç ‡§≤‡•ã‡§® ‡§¶‡§ø‡§≤‡§æ‡§®‡•á ‡§è‡§µ‡§ú...,‡§¨‡•à‡§Ç‡§ï‡•ã‡§Ç ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§∞‡§ø‡§∂‡•ç‡§§‡§æ ‡§¶‡§≤‡§æ‡§≤‡•Ä ‡§ö‡•ã‡§∞‡•ã‡§Ç ‡§≤‡•ã‡§® ‡§¶‡§ø‡§≤‡§æ‡§®‡•á ‡§è‡§µ‡§ú...
6,0,#‡§™‡§ó‡§≤‡•Ä üòç ‡§π‡§Æ ‡§§‡•ã #‡§¨‡§π‡•ã‡§§ üòé _‡§∂‡•ã‡§ï‡•Ä‡§® ‡§π‡•à‡§Ç üë∏üëà #‡§§‡•Å‡§ú‡•á ‡§°‡•ã‡§≤‡•Ä...,‡§™‡§ó‡§≤‡•Ä smiling_face_with_heart-eyes ‡§¨‡§π‡•ã‡§§ smiling...,‡§™‡§ó‡§≤‡•Ä smiling_face_with_heart-eyes ‡§¨‡§π‡•ã‡§§ smiling...
7,0,ohh my god . ‡§Ø‡§π‡§æ‡§Ç ‡§á‡§§‡§®‡•á ‡§∏‡§æ‡§∞‡•á ‡§ö‡§Æ‡§ö‡•á ‡§á‡§∏‡§ï‡§æ ‡§Æ‡§§‡§≤‡§¨ ‡§ú‡§≤‡§®...,ohh my god ‡§á‡§§‡§®‡•á ‡§ö‡§Æ‡§ö‡•á ‡§Æ‡§§‡§≤‡§¨ ‡§ú‡§≤‡§®‡§ñ‡•ã‡§∞‡•ã ‡§ï‡§Æ‡•Ä ‡§®‡§π‡•Ä,ohh my god ‡§á‡§§‡§®‡•á ‡§ö‡§Æ‡§ö‡•á ‡§Æ‡§§‡§≤‡§¨ ‡§ú‡§≤‡§®‡§ñ‡•ã‡§∞‡•ã ‡§ï‡§Æ‡•Ä ‡§®‡§π‡•Ä
8,1,‡§Ü‡§™ ‡§Æ‡•á‡§∞‡•á ‡§ï‡§Æ‡•á‡§Ç‡§ü ‡§ï‡§æ ‡§á‡§§‡§®‡§æ ‡§á‡§Ç‡§§‡§ú‡§æ‡§∞ ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§ï‡§∞‡§§‡•á ‡§π‡•à,‡§Æ‡•á‡§∞‡•á ‡§ï‡§Æ‡•á‡§Ç‡§ü ‡§á‡§§‡§®‡§æ ‡§á‡§Ç‡§§‡§ú‡§æ‡§∞,‡§Æ‡•á‡§∞‡•á ‡§ï‡§Æ‡•á‡§Ç‡§ü ‡§á‡§§‡§®‡§æ ‡§á‡§Ç‡§§‡§ú‡§æ‡§∞
9,0,‡§≠‡•ã‡§∏‡§°‡§º‡•Ä ‡§ï‡•ã ‡§¨‡§æ‡§§ ‡§ï‡§∞‡§®‡•á ‡§¶‡•ã,‡§≠‡•ã‡§∏‡§°‡§º‡•Ä ‡§¨‡§æ‡§§,‡§≠‡•ã‡§∏‡§°‡§º‡•Ä ‡§¨‡§æ‡§§


In [14]:
# Split dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2)

# Convert text data to TF-IDF vectors
vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf=True)

train_X = vectorizer.fit_transform(train_df['final_text'])
val_X = vectorizer.transform(val_df['final_text'])

# Convert labels to NumPy arrays
train_y = np.array(train_df['label'])
val_y = np.array(val_df['label'])

# Train KNN model on the training set
k = 18 # number of neighbors
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(train_X, train_y)

# Evaluate KNN model on the validation set
print(classification_report(val_y, knn.predict(val_X)))

              precision    recall  f1-score   support

           0       0.61      0.86      0.71      2117
           1       0.71      0.38      0.50      1920

    accuracy                           0.63      4037
   macro avg       0.66      0.62      0.60      4037
weighted avg       0.66      0.63      0.61      4037



In [18]:
import pickle
knnPickle = open('/content/drive/MyDrive/Colab Notebooks/models/knnpickle_file', 'wb') 
      
# source, destination 
pickle.dump(knn, knnPickle)  

# close the file

knnPickle.close()

In [22]:
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hindi_test.csv')

In [23]:

# preprocess data
# remove punctuation from text
test_df['Clean Text'] = test_df['text'].apply(remove_punctuations)

# convert emoji to text
test_df['Clean Text'] = test_df['Clean Text'].apply(convert_emojis)

# remove digits from text
test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
test_df['Clean Text'] = test_df['Clean Text'].apply(remove_stopwords_hindi)

test_df['final_text'] = test_df['Clean Text']

test_df.head(10)

  test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')


Unnamed: 0,label,text,Clean Text,final_text
0,0,‡§Æ‡•à‡§Ç ‡§Ø‡•á ‡§®‡§π‡•Ä ‡§∏‡•ã‡§ö ‡§∞‡§π‡§æ ‡§ï‡•Ä ‡§á‡§∏‡•á ‡§®‡§ø‡§ï‡§≤‡•á ‡§ï‡•à‡§∏‡•á ‡§Æ‡•à‡§Ç ‡§Ø‡•á ‡§∏‡•ã...,‡§®‡§π‡•Ä ‡§∏‡•ã‡§ö ‡§®‡§ø‡§ï‡§≤‡•á ‡§∏‡•ã‡§ö ‡§´‡§Ç‡§∏‡§æ ‡§ï‡•à‡§∏‡•áface_with_tears_of_...,‡§®‡§π‡•Ä ‡§∏‡•ã‡§ö ‡§®‡§ø‡§ï‡§≤‡•á ‡§∏‡•ã‡§ö ‡§´‡§Ç‡§∏‡§æ ‡§ï‡•à‡§∏‡•áface_with_tears_of_...
1,1,‡§î‡§∞ ‡§¶‡§ø‡§µ‡§æ‡§≤‡•Ä ‡§Æ‡•á‡§Ç ‡§≠‡•Ä ‡§™‡•Ç‡§∞‡§æ ‡§¶‡•á‡§∂ ‡§™‡§°‡§º‡§æ‡§ï‡§æ ‡§®‡§π‡•Ä‡§Ç ‡§´‡•ã‡§°‡§æ‡§§,‡§¶‡§ø‡§µ‡§æ‡§≤‡•Ä ‡§¶‡•á‡§∂ ‡§™‡§°‡§º‡§æ‡§ï‡§æ ‡§´‡•ã‡§°‡§æ‡§§,‡§¶‡§ø‡§µ‡§æ‡§≤‡•Ä ‡§¶‡•á‡§∂ ‡§™‡§°‡§º‡§æ‡§ï‡§æ ‡§´‡•ã‡§°‡§æ‡§§
2,1,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§™‡§æ‡§≤ ‡§≤‡•á‡§®‡§æ ‡§Æ‡§ó‡§∞ ‡§ó‡§≤‡§§ ‡§´‡§π‡§Æ‡•Ä ‡§ï‡§≠‡•Ä ‡§®‡§π‡•Ä‡§Ç‡•§,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§™‡§æ‡§≤ ‡§≤‡•á‡§®‡§æ ‡§ó‡§≤‡§§ ‡§´‡§π‡§Æ‡•Ä ‡§®‡§π‡•Ä‡§Ç‡•§,‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§™‡§æ‡§≤ ‡§≤‡•á‡§®‡§æ ‡§ó‡§≤‡§§ ‡§´‡§π‡§Æ‡•Ä ‡§®‡§π‡•Ä‡§Ç‡•§
3,0,‡§§‡•á‡§∞‡•Ä ‡§ó‡§æ‡§Ç‡§° ‡§Æ‡•á‡§Ç ‡§™‡•ç‡§Ø‡§æ‡§ú ‡§ï‡§æ‡§ü ‡§¶‡•á‡§ó‡§æ ‡§ó‡•Å‡§ú‡•ç‡§ú‡§∞ ‡§≠‡•ã‡§∏‡§°‡§º‡•Ä ‡§ï‡•á,‡§§‡•á‡§∞‡•Ä ‡§ó‡§æ‡§Ç‡§° ‡§™‡•ç‡§Ø‡§æ‡§ú ‡§ï‡§æ‡§ü ‡§¶‡•á‡§ó‡§æ ‡§ó‡•Å‡§ú‡•ç‡§ú‡§∞ ‡§≠‡•ã‡§∏‡§°‡§º‡•Ä,‡§§‡•á‡§∞‡•Ä ‡§ó‡§æ‡§Ç‡§° ‡§™‡•ç‡§Ø‡§æ‡§ú ‡§ï‡§æ‡§ü ‡§¶‡•á‡§ó‡§æ ‡§ó‡•Å‡§ú‡•ç‡§ú‡§∞ ‡§≠‡•ã‡§∏‡§°‡§º‡•Ä
4,1,‡§¨‡§Ç‡§ó‡§æ‡§≤‡•Ä ‡§∏‡§æ‡§°‡§º‡•Ä ‡§ê‡§∏‡•á ‡§®‡§π‡•Ä‡§Ç ‡§™‡§π‡§®‡§æ ‡§ú‡§æ‡§§‡§æ ‡§π‡•à ‡§¶‡•Ä‡§¶‡•Ä,‡§¨‡§Ç‡§ó‡§æ‡§≤‡•Ä ‡§∏‡§æ‡§°‡§º‡•Ä ‡§™‡§π‡§®‡§æ ‡§¶‡•Ä‡§¶‡•Ä,‡§¨‡§Ç‡§ó‡§æ‡§≤‡•Ä ‡§∏‡§æ‡§°‡§º‡•Ä ‡§™‡§π‡§®‡§æ ‡§¶‡•Ä‡§¶‡•Ä
5,1,‡§ê ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ ‡§π‡•à ‡§Ø‡§π ‡§Ü‡§¶‡§Æ‡•Ä ‡§¶‡•ã ‡§¨‡§æ‡§∞ ‡§ú‡•Ä‡§§‡§æ ‡§π‡•à ‡§è‡§ï ‡§¨‡§æ‡§∞ ‡§Æ‡§∞‡§§...,‡§ê ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ ‡§Ü‡§¶‡§Æ‡•Ä ‡§ú‡•Ä‡§§‡§æ ‡§Æ‡§∞‡§§‡§æ,‡§ê ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ ‡§Ü‡§¶‡§Æ‡•Ä ‡§ú‡•Ä‡§§‡§æ ‡§Æ‡§∞‡§§‡§æ
6,1,‡§Ö‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§Ç‡§¨‡•á ‡§¨‡•ã ‡§°‡•Ä‡§ú‡§≤ ‡§®‡§¨‡•ç‡§¨‡•á ‡§™‡•á‡§ü‡•ç‡§∞‡•ã‡§≤ ‡§∏‡•å ...,‡§Ö‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§Ç‡§¨‡•á ‡§¨‡•ã ‡§°‡•Ä‡§ú‡§≤ ‡§®‡§¨‡•ç‡§¨‡•á ‡§™‡•á‡§ü‡•ç‡§∞‡•ã‡§≤ ‡§∏‡•å ‡§∏‡•å...,‡§Ö‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§ï‡•ç‡§ï‡§°‡§º ‡§¨‡§Ç‡§¨‡•á ‡§¨‡•ã ‡§°‡•Ä‡§ú‡§≤ ‡§®‡§¨‡•ç‡§¨‡•á ‡§™‡•á‡§ü‡•ç‡§∞‡•ã‡§≤ ‡§∏‡•å ‡§∏‡•å...
7,1,‡§è‡§ï ‡§§‡•Ä‡§∞ ‡§è‡§ï ‡§ï‡§Æ‡§æ‡§® ‡§Ü‡§¶‡§ø‡§µ‡§æ‡§∏‡•Ä ‡§è‡§ï ‡§∏‡§Æ‡§æ‡§® ‡§è‡§ï ‡§§‡•Ä‡§∞ ‡§è‡§ï ‡§ï‡§Æ‡§æ‡§® ...,‡§§‡•Ä‡§∞ ‡§ï‡§Æ‡§æ‡§® ‡§Ü‡§¶‡§ø‡§µ‡§æ‡§∏‡•Ä ‡§§‡•Ä‡§∞ ‡§ï‡§Æ‡§æ‡§® ‡§ú‡§Ø ‡§∂‡•ç‡§∞‡•Ä‡§∞‡§æ‡§Æ ‡§ú‡§Ø ‡§∂‡•ç‡§∞‡•Ä‡§∞‡§æ...,‡§§‡•Ä‡§∞ ‡§ï‡§Æ‡§æ‡§® ‡§Ü‡§¶‡§ø‡§µ‡§æ‡§∏‡•Ä ‡§§‡•Ä‡§∞ ‡§ï‡§Æ‡§æ‡§® ‡§ú‡§Ø ‡§∂‡•ç‡§∞‡•Ä‡§∞‡§æ‡§Æ ‡§ú‡§Ø ‡§∂‡•ç‡§∞‡•Ä‡§∞‡§æ...
8,1,‡§Ü‡§™‡§ï‡§æ ‡§¨‡§π‡•Å‡§§ ‡§¨‡§°‡§º‡§æ ‡§´‡•à‡§® ‡§π‡•Ç‡§Ç ‡§Ö‡§∏‡§¶ ‡§ì‡§µ‡•à‡§∏‡•Ä ‡§∏‡§æ‡§π‡§¨ ‡§Æ‡•à‡§Ç ‡§Ü‡§™‡§ï‡§æ...,‡§´‡•à‡§® ‡§Ö‡§∏‡§¶ ‡§ì‡§µ‡•à‡§∏‡•Ä ‡§∏‡§æ‡§π‡§¨ ‡§´‡•à‡§® ‡§Æ‡•Å‡§ú‡§Æ‡•ç‡§Æ‡§ø‡§≤ ‡§•‡§æ‡§≤‡•Ä number,‡§´‡•à‡§® ‡§Ö‡§∏‡§¶ ‡§ì‡§µ‡•à‡§∏‡•Ä ‡§∏‡§æ‡§π‡§¨ ‡§´‡•à‡§® ‡§Æ‡•Å‡§ú‡§Æ‡•ç‡§Æ‡§ø‡§≤ ‡§•‡§æ‡§≤‡•Ä number
9,0,‡§§‡•Å‡§Æ ‡§∏‡§¨ ‡§ö‡•Ç‡§§‡§ø‡§Ø‡§æ ‡§π‡•ã ‡§∞‡•ã‡§ü‡•Ä ‡§∞‡§æ‡§Æ,‡§∏‡§¨ ‡§ö‡•Ç‡§§‡§ø‡§Ø‡§æ ‡§∞‡•ã‡§ü‡•Ä ‡§∞‡§æ‡§Æ,‡§∏‡§¨ ‡§ö‡•Ç‡§§‡§ø‡§Ø‡§æ ‡§∞‡•ã‡§ü‡•Ä ‡§∞‡§æ‡§Æ


In [24]:
import pickle
# KNN
# Data preparation
def testKNN(testdf):
  # Load the saved model from .pkl file
  with open('/content/drive/MyDrive/Colab Notebooks/models/knnpickle_file', 'rb') as file:
      knn_model = pickle.load(file)

  # vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf=True)
  test_X = vectorizer.transform(testdf['final_text'])

  # Convert labels to NumPy arrays
  test_y = np.array(testdf['label'])
  # Make predictions on test dataset using the loaded model
  y_pred = knn_model.predict(test_X)

  # Evaluate KNN model on the validation set
  print(classification_report(test_y, y_pred))  

testKNN(test_df)

              precision    recall  f1-score   support

           0       0.61      0.87      0.72      3496
           1       0.74      0.40      0.52      3232

    accuracy                           0.64      6728
   macro avg       0.68      0.63      0.62      6728
weighted avg       0.67      0.64      0.62      6728



In [20]:
def print_intersection_points(train_df, val_df, new_testdf):
  # Extract the column of interest from training and test dataframes
  train_col = set(train_df["text"])
  test_col = set(new_testdf["text"])

  # Count the number of common rows between the two dataframes
  common_rows = len(train_col.intersection(test_col))
  print("Number of common rows between train and test:", common_rows)

  # Extract the column of interest from validation and test dataframes
  valid_col = set(val_df["text"])

  # Count the number of common rows between the two dataframes
  common_rows = len(valid_col.intersection(test_col))
  print("Number of common rows between validation and test:", common_rows)

In [21]:
print_intersection_points(train_df, val_df, test_df)

Number of common rows between train and test: 0
Number of common rows between validation and test: 0
