<a href="https://colab.research.google.com/github/tejaschaudhari2811/FIRE_Project_Task_1/blob/main/FIRE_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1 : Message Level Sentiment Analysis for Tamil Language

In [None]:
import pandas as pd
import re




In [None]:
print("Upload Training Data \n")
from google.colab import files
uploaded = files.upload()

In [None]:
df = pd.read_table("tamil_offensive_train.tsv")

In [None]:
# Define Polarities for numeric classification
polarities={'NOT':1, "OFF":-1,"not-Tamil":0}

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5880 non-null   object
 1   text      5880 non-null   object
 2   category  5880 non-null   object
dtypes: object(3)
memory usage: 137.9+ KB
None


In [None]:
df.groupby(by=['category']).count()

Unnamed: 0_level_0,id,text
category,Unnamed: 1_level_1,Unnamed: 2_level_1
NOT,4724,4724
OFF,1153,1153
not-Tamil,3,3


In [None]:
# Supplementary Functions
def convert_polarity(category):
    """
    Takes in category as string 
    outputs integer representation
    """
    return polarities[category]

def word_length(sentence):
  """
  Takes in Tamil Text
  Counts Words
  Returns number of Words
  """
    return len(sentence.split(" "))

In [None]:
df['polarity'] = df['category'].apply(convert_polarity)
df['text_length'] = df['text'].apply(len)
df['number_of_words'] = df['text'].apply(word_length)

In [None]:
df.head()

Unnamed: 0,id,text,category,polarity,text_length,number_of_words
0,tam1,திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற...,NOT,1,60,8
1,tam2,இந்த ட்ரெய்லர் கூட பார்க்கிற மாதிரி இல்லை.. இத...,OFF,-1,80,12
2,tam3,மைசூரு செட்டியார் சமூகத்தின் சார்பாக இப்படம் வ...,NOT,1,69,7
3,tam4,மொத்த சாதியும் ஒரு சாதிக்கு எதிரா நிக்குது.......,OFF,-1,84,13
4,tam5,only for விஜய் சேதுபதி and STR,NOT,1,30,6


In [None]:
df = df.drop(df[df.polarity == 0].index)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
tfidf = TfidfVectorizer(max_features=1000)

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()



In [None]:
# Get the data from Dataframe.
X = df['text']
y = df['polarity']

# Convert data to vectors
X = tfidf.fit_transform(X)
X_ros,y_ros = ros.fit_resample(X,y)



In [None]:
def svm_classifier(X_ros,y_ros):
  """
  Takes in oversampled data
  Outputs support vector machine model and classification report
  """
  X_train,X_test,y_train,y_test = train_test_split(X_ros,y_ros,test_size=0.2, random_state=0)
  clf = LinearSVC()
  linear_svc_model = clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  report = classification_report(y_pred,y_test)
  return linear_svc_model,report


In [None]:
model,report= svm_classifier(X_ros,y_ros)

In [None]:
print(report)

              precision    recall  f1-score   support

          -1       0.85      0.78      0.81      1041
           1       0.75      0.83      0.79       849

    accuracy                           0.80      1890
   macro avg       0.80      0.81      0.80      1890
weighted avg       0.81      0.80      0.80      1890



# Using Logistic Regression for Classification

In [None]:
def logistic_regression_classifier(X_ros,y_ros):
  """
  Takes in oversampled data
  Outputs Linear Regression Model and classification report
  """
  X_train,X_test,y_train,y_test = train_test_split(X_ros,y_ros,test_size=0.2, random_state=0)
  clf = LogisticRegression(random_state=0)
  logistic_regression_model = clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  report = classification_report(y_pred,y_test)
  return logistic_regression_model,report

In [None]:
lr_model,report = logistic_regression_classifier(X_ros,y_ros)
print(report)

              precision    recall  f1-score   support

          -1       0.81      0.78      0.80       995
           1       0.77      0.80      0.78       895

    accuracy                           0.79      1890
   macro avg       0.79      0.79      0.79      1890
weighted avg       0.79      0.79      0.79      1890



# Use of Indic-nlp-library for Text Processing

In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_resources.git"

In [None]:
!pip install indic-nlp-library

In [None]:
INDIC_NLP_RESOURCES=r"/content/indic_nlp_resources"

In [None]:
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
from indicnlp import loader
loader.load()

In [None]:
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator, ItransTransliterator
string = "திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற வாழ்த்துக்கள்"
sentences = sentence_tokenize.sentence_split(string, lang='ta')
for se in indic_tokenize.trivial_tokenize(string):
  print(ItransTransliterator.to_itrans(se,'en'))

திருமலை
நாயக்கர்
பேரவை
சார்பாக
படம்
வெற்றி
பெற
வாழ்த்துக்கள்


In [None]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

#input_text='राजस्थान'
input_text='திருமலை'
lang='ta'

print(ItransTransliterator.to_itrans(input_text,lang))

tirumalai


In [None]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = TFBertModel.from_pretrained("bert-base-multilingual-uncased")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1715180.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=999358484.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
text = "திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற வாழ்த்துக்கள்"
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

In [None]:
X_hf = df['text']
y_hf = df['polarity']

X_hf = X_hf.apply(tokenizer)

In [None]:
X_hf[0]

{'input_ids': [101, 54552, 60211, 25088, 808, 37025, 20798, 13047, 810, 55541, 20841, 802, 15241, 85286, 12146, 810, 44042, 818, 96902, 49815, 810, 96902, 36718, 818, 13744, 40806, 95588, 12076, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
import numpy as np

In [None]:
# Remove Unnecessary keys and values from the embeddings
X_train = []
for text in X_hf:
  text.pop('token_type_ids')
  text.pop('attention_mask')
  X_train.append(np.array(text['input_ids']))

print("Finished")

Finished


In [None]:
X_ros,y_ros = ros.fit_resample(X_train,y_hf)

  return array(a, dtype, copy=False, order=order)


ValueError: ignored

# Get Emoji Embeddings

In [1]:
!pip install gensim



In [2]:
!git clone "https://github.com/uclnlp/emoji2vec.git"

Cloning into 'emoji2vec'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 62 (delta 0), reused 1 (delta 0), pack-reused 59[K
Unpacking objects: 100% (62/62), done.


In [3]:
import gensim.models as gsm

e2v = gsm.KeyedVectors.load_word2vec_format('/content/emoji2vec/pre-trained/emoji2vec.bin', binary=True)
happy_vector = e2v['😂']    # Produces an embedding vector of length 300

In [4]:
print(happy_vector)

[ 0.00368995 -0.00747812  0.04797658  0.0592004  -0.0594517  -0.01919357
  0.06628808 -0.06752758  0.10211562  0.087887   -0.02949324 -0.07718539
 -0.07068242  0.04688684 -0.07970048  0.09425059  0.06230384  0.09221437
 -0.03019106 -0.08475313  0.06153254  0.08948752  0.07492744 -0.0697182
  0.02842957 -0.07775956 -0.00585377 -0.01818646  0.07633407 -0.09874155
 -0.05507204  0.04539224  0.00842756 -0.06562244 -0.04902945 -0.05832005
  0.07070394  0.05070673  0.05466724  0.08123799  0.06764334 -0.09264724
  0.07862549 -0.03581576  0.02264971 -0.00622183 -0.0318586   0.0421854
 -0.05610641  0.07420981 -0.06671992 -0.05842454  0.03449175  0.07620545
  0.08768762  0.03041447 -0.05500258 -0.03190211  0.07291865 -0.04514114
 -0.0541351   0.05987531 -0.0346354  -0.01317827  0.00849659 -0.01372368
 -0.01279207 -0.01283779 -0.07702036  0.09149366  0.0390107   0.07108027
 -0.01238731  0.08397982 -0.08521689 -0.06615324  0.02603729  0.07916152
  0.08146766  0.09780202 -0.05352468  0.03703088  0.0

# Dataset Expansion using INDIC NLP Romanization Module

In [None]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

input_text='திருமலை'
lang='ta'

print(ItransTransliterator.to_itrans(input_text,lang))

tirumalai


In [None]:
English_Tamil_comments = []
id = 0
id_list = []

for text in df['text']:
  id_list.append(str("eng{}".format(id)))
  English_Tamil_comments.append((ItransTransliterator.to_itrans(text,'ta')))
  id +=1

In [None]:
dict_ = {'id':id_list,'text':English_Tamil_comments, 'category':df['category']}

In [None]:
new_df = pd.DataFrame.from_dict(dict_)
df = df.append(new_df,ignore_index=True)

# Add English Translated Comments to Data

In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading https://files.pythonhosted.org/packages/19/3d/4e3a1609bf52f2f7b00436cc751eb977e27040665dde2bd57e7152989672/googletrans-3.1.0a0.tar.gz
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans: filename=googletrans-3.1.0a0-cp37-none-any.whl size=16368 sha256=f6f1565873f6a96c75c3e22511518e5260a0ccbb2317aeb5476c26f92535221d
  Stored in directory: /root/.cache/pip/wheels/27/7a/a0/aff3babbb775549ce6813cb8fa7ff3c0848c4dc62c20f8fdac
Successfully built googletrans
Installing collected packages: googletrans
  Found existing installation: googletrans 4.0.0rc1
    Uninstalling googletrans-4.0.0rc1:
      Successfully uninstalled googletrans-4.0.0rc1
Successfully installed googletrans-3.1.0a0


In [None]:
!pip uninstall googletrans

Uninstalling googletrans-3.1.0a0:
  Would remove:
    /usr/local/bin/translate
    /usr/local/lib/python3.7/dist-packages/googletrans-3.1.0a0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/googletrans/*
Proceed (y/n)? 

In [None]:
from googletrans import Translator, constants
translator = Translator()

In [None]:
translation = translator.translate("திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற வாழ்த்துக்கள்")
print(translation.text)

திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற வாழ்த்துக்கள்


In [None]:
Tamil_translated_comments = []
id = 0
id_list = []

for text in df['text']:
  id_list.append(str("trans_{}".format(id)))
  id +=1
  if id < 2370:
    continue
  translator.raise_Exception = True
  translation = translator.translate(text)
  print(id)
  Tamil_translated_comments.append(translation.text)
  
  
print(Tamil_translated_comments)

In [None]:
import pickle
with open("tamil_english_translated_2.pkl","wb") as f:
  pickle.dump(Tamil_translated_comments,f)

In [None]:
import pickle
with open("/content/tamil_english_translated.pkl","rb") as f:
  list_1 = pickle.load(f)

In [None]:
all_comments = list_1+Tamil_translated_comments

In [None]:
print(len(df['text']))

5880


In [None]:
print(len(all_comments))

5881


In [None]:
print(all_comments[5880])

உதயநிதிக்கு நடிப்பு வரலனு சொல்ரவங்க லைக் போடுங்கள் ...


In [None]:
#dict_ = {'id':id_list,'text':Tamil_translated_comments, 'category':df['category']}
dict_ = {text':Tamil_translated_comments}

SyntaxError: ignored

In [None]:
print(Tamil_translated_comments)

['Congratulations to win the film on behalf of Thirumalai Nayakar', 'This trailer does not even see the model .. this is the theater go and see ..', 'I congratulate this film on behalf of Mysore Chettiar community', 'Total caste is a nickname for a caste ...', 'Only for Vijay Sethupathi and str', 'This is why Psycho Picture is Burly', 'Mutha and the caste paint will now come to Mohan and the caste dye', 'Success to the film on behalf of all people who speak in Tamil Nadu', 'We are important for both of us to cut the hand of the yaroo', 'The picture is more realistic than that', 'This movie is dominated by the Tamil cinema, getting the great success,', 'We were Mohan Anna, anxiety', 'Congratulations on behalf of the people in Tamil Nadu ... Papa according to the caste.You have a lot of offers (for the flow)) Puba.', 'I looked at 105 times.Kilker, Beautiful ...', 'If the society is society, let the poison in the blowing air.', 'Brother ... You are everything YouTube-Gaya ??Despite any Vi

In [None]:
# Start translation again from 2370

In [None]:
new_df = pd.DataFrame.from_dict(dict_)
#df = df.append(new_df,ignore_index=True)

ValueError: ignored

# Create Embeddings Dictionary
### 1. Find all unique emojis in the dataset
### 2. Get embeddings for the emojis
### 3. Convert emojis to Embeddings and replace in senteces after embeddings.

In [5]:
!pip install emojis

Collecting emojis
  Downloading https://files.pythonhosted.org/packages/2e/94/61025e53488acd95b49862ec854e05b036f92fe9d0e512ca551a5a8b03d6/emojis-0.6.0-py3-none-any.whl
Installing collected packages: emojis
Successfully installed emojis-0.6.0


In [6]:
# Check emojis in each sentence, if present, extract out.
import emojis

emoji_list = []

for text in df['text']:

  set_ = emojis.get(text)
  
  if len(set_) != 0:
    emoji_list.append(list(set_))

NameError: ignored

In [None]:
# Create list of Unique Emojis
unique_emojis = []

for emoji in emoji_list:
  
  for emo in emoji:
    
    if emo not in unique_emojis:
      unique_emojis.append(emo)

In [None]:
for emoji in unique_emojis:
  try:
    (e2v[emoji])
  except:
    KeyError
    print(emoji)

In [None]:
#@title
# As a large amount of Emojis are not present in th pretrained emoji2vec data. Need to Find new Emoji Embeddings.

# BERT_Multilingual_Embeddings

In [None]:
!pip install transformers



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel

In [None]:
from sklearn.model_selection import train_test_split

import pandas as pd

In [None]:
df = pd.read_table("tamil_offensive_train.tsv")

train_df, test_df = train_test_split(df,test_size=0.2,random_state=32)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [None]:
features =[]

In [None]:
model = BertModel.from_pretrained("bert-base-multilingual-uncased")

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
train_texts = train_df['text'].values.tolist()
train_targets = train_df['category'].values.tolist()
train_targets = [ 1  if t == 'OFF' else 0 for t in train_targets]

train_df['num_words'] = train_df['text'].str.split().apply(len)

train_df.num_words.quantile(0.98)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


52.0

In [None]:
sample_text = train_texts[:64]
#need to run this in batches to extract feature for entire dataset
t = tokenizer(sample_text,max_length = 128,
                                padding = "max_length",
                                truncation = True)

In [None]:
input_ids = torch.tensor(t['input_ids'])
attention_masks = torch.tensor(t['attention_mask'])

In [None]:
with torch.no_grad():
    lhs,pooler_output = model(input_ids,attention_masks,return_dict=False)

In [None]:
print(sample_text[0])

நாயுடு இன மக்கள் சார்பாக இப்படம் வெற்றி பெற வாழ்த்துக்கள்


In [None]:
pooler_output

tensor([[ 0.2038,  0.0239,  0.1689,  ...,  0.0469, -0.1777, -0.1806],
        [ 0.1562,  0.0432,  0.1683,  ...,  0.0359, -0.1625, -0.1776],
        [-0.0627,  0.2241,  0.0246,  ..., -0.0363, -0.1432, -0.0345],
        ...,
        [ 0.0320,  0.1387,  0.0961,  ...,  0.0311, -0.1818, -0.1429],
        [ 0.2781,  0.0265,  0.2385,  ...,  0.0797, -0.1620, -0.2133],
        [ 0.1049,  0.0713,  0.1367,  ...,  0.0635, -0.2036, -0.1711]])