In [1]:
!pip install transformers
!pip install simpletransformers

In [2]:
# Mount Google Drive at /content/drive so that files stored there can be
# read by path from the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


English Data

### Mixing English data with Tamil data translated to English

In [3]:
import pandas as pd
# Read the balanced dataset and encode the string categories as integer ids.
# Both spellings of the neutral class collapse to 0.
concatenated = pd.read_csv('/content/drive/MyDrive/NLP-Project/BalencedLGBT_dataset.csv')
label_ids = {
    'Non-anti-LGBT+ content': 0,
    'ally': 0,
    'Homophobic': 1,
    'Transphobic': 2,
}
concatenated['category'] = concatenated['category'].replace(label_ids)

In [4]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

df = concatenated

X = df['text']
y = df['category']

# Stage 1: hold out 20% of the rows as the test set, stratified on the label.
testsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_val_idx, test_idx = next(testsplit.split(X, y))

X_train_val, y_train_val = X.iloc[train_val_idx], y.iloc[train_val_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Stage 2: carve 25% of the remaining 80% out as validation data, which
# yields an overall 60/20/20 train/validation/test split.
validsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_idx, val_idx = next(validsplit.split(X_train_val, y_train_val))

X_train, y_train = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
X_val, y_val = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]

# Re-assemble each split into a two-column frame (text + category).
train_df = pd.concat({'text': X_train, 'category': y_train}, axis=1)
val_df = pd.concat({'text': X_val, 'category': y_val}, axis=1)
test_df = pd.concat({'text': X_test, 'category': y_test}, axis=1)

print(f'Training set shape: {train_df.shape}')
print(f'Validation set shape: {val_df.shape}')
print(f'Test set shape: {test_df.shape}')


Training set shape: (5331, 2)
Validation set shape: (1777, 2)
Test set shape: (1778, 2)


In [5]:
# Per-class row counts of every split, in train / validation / test order.
for split_frame in (train_df, val_df, test_df):
    print(split_frame['category'].value_counts())

1    1795
0    1794
2    1742
Name: category, dtype: int64
0    598
1    598
2    581
Name: category, dtype: int64
0    599
1    598
2    581
Name: category, dtype: int64


In [6]:
# Keep only the text and the target, exposing the target under the name
# 'labels' (the column name used by the training and evaluation cells below).
# Renaming *before* selecting makes this cell safe to re-run: on a second run
# the rename is a no-op instead of a KeyError on the missing 'category' column.
train_df = train_df.rename(columns={'category': 'labels'})[['text', 'labels']]
val_df = val_df.rename(columns={'category': 'labels'})[['text', 'labels']]
test_df = test_df.rename(columns={'category': 'labels'})[['text', 'labels']]

In [7]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:

import emoji
def _demojize_text(text):
    """Return ``text`` with emoji replaced by their names, delimited by spaces."""
    return emoji.demojize(text, delimiters=(" ", " "))


# Apply the same emoji normalisation to the text column of every split.
for _split_frame in (train_df, val_df, test_df):
    _split_frame['text'] = _split_frame['text'].apply(_demojize_text)

## Training Model with English Data

In [9]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

# Log at INFO level in general, but only warnings and above from transformers.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Fine-tuning hyper-parameters handed to simpletransformers.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    fp16=False,
    num_train_epochs=3,
    max_seq_length=128,
    train_batch_size=32,
    eval_batch_size=32,
    logging_steps=50,
    save_steps=2000,
    learning_rate=3e-5,
    manual_seed=4,
)

# Three-way classifier (ally / homophobic / transphobic) on top of the
# ai4bharat/indic-bert ALBERT checkpoint.
model = ClassificationModel(
    model_type="albert",
    model_name="ai4bharat/indic-bert",
    num_labels=3,
    args=train_args,
)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.bias', 'sop_classifier.classifier.weight', 'predictions.decoder.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

In [10]:
# Train the model
# Only the training split is passed here; the validation split is scored
# separately with eval_model in the next cell.
model.train_model(train_df)

  0%|          | 0/5331 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/167 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/167 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/167 [00:00<?, ?it/s]

(501, 0.5813534441301803)

Validation Set


In [11]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``."""
    return f1_score(y_true=labels, y_pred=preds, average='macro')


# Score the validation split; the keyword names become keys of `result`.
result, model_outputs, wrong_predictions = model.eval_model(
    val_df,
    f1=f1_multiclass,
    acc=accuracy_score,
)
result

  0%|          | 0/1777 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/56 [00:00<?, ?it/s]

{'mcc': 0.8100922942578491,
 'f1': 0.8737862996966929,
 'acc': 0.8733821046707935,
 'eval_loss': 0.36487200004713877}

Testing With English Data

In [12]:
# Display the held-out test split (columns: text, labels).
test_df

Unnamed: 0,text,labels
1553,This is an alu puntainga karumo chiii,2
6875,"Just thinking about it, if there is a psycho, ...",0
3257,You can see the boy in the shape of a boy.. Pa...,1
2112,"third polygamists, those who eat manlike indiv...",2
7668,"{Singal tee machi u tube} Like Chanel, can you...",0
...,...,...
2679,She acts very nice. Deoxyadenosine monophospha...,2
5906,Malini sister is awesome,0
6855,It is acceptable in our culture ...,0
6867,Having no word to describe but I seen multiple...,0


In [13]:
# Predict the held-out English test split and collect every row whose
# predicted class differs from the gold label.
test = test_df['text'].tolist()
labels = test_df['labels'].tolist()
predictions, _ = model.predict(test)

misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Just thinking about it, if there is a psycho, ...",0,1
1,You can see the boy in the shape of a boy.. Pa...,1,0
2,Bodi Losu Club,2,1
3,Consistence comprise female. .. Simply encepha...,2,1
4,Any early than cosmos,2,1
...,...,...,...
212,I believe in today's time... The only relation...,0,1
213,Venkatesh Rajagopalan It's normal for animals ...,0,1
214,Culture and all other shit comes after nature ...,0,1
215,There is a gay lesbian combination in every li...,0,1


In [14]:
from sklearn.metrics import classification_report
target_names = ['ally', 'homophobic', 'transphobic']
# Pin the label ids so that each report row is tied to the intended class name
# even when a class is absent from both the gold labels and the predictions,
# and report 0 (instead of emitting a warning) for undefined precision/recall.
print(classification_report(
    labels,
    predictions,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.85      0.85      0.85       599
  homophobic       0.85      0.86      0.86       598
 transphobic       0.94      0.92      0.93       581

    accuracy                           0.88      1778
   macro avg       0.88      0.88      0.88      1778
weighted avg       0.88      0.88      0.88      1778



### Checking Tamil data performance

In [15]:
# English test sentences. The header of this file pads the text column name
# with trailing spaces, so strip the column names rather than matching the
# exact amount of padding.
df_test = (
    pd.read_csv('/content/eng_3_test.tsv', delimiter='\t')
    .rename(columns=str.strip)
    .rename(columns={'text': 'eng'})
)

# Tamil version of the test sentences (assumed to be row-aligned with the
# English file — TODO confirm).
o_test = (
    pd.read_csv('/content/eng_tam_test.tsv', delimiter='\t')
    .rename(columns={'text': 'tamil'})
)

# Join the files side by side first and only then drop incomplete rows.
# Dropping NaNs per file before the join lets the index alignment of
# concat(axis=1) re-introduce NaNs whenever the two files lose different rows,
# which would then break emoji.demojize below.
combined_data = pd.concat([df_test, o_test], axis=1).dropna()
combined_data = combined_data.loc[:, ~combined_data.columns.str.contains('^Unnamed')]
# Keep columns that occur in both files (e.g. 'category') once, by name. This
# avoids transposing the whole frame, which also turns every dtype into object.
combined_data = combined_data.loc[:, ~combined_data.columns.duplicated()].copy()

# Same integer encoding as the training data; both neutral spellings map to 0.
label_to_id = {
    'Non-anti-LGBT+ content': 0,
    'ally': 0,
    'Homophobic': 1,
    'Transphobic': 2,
}
combined_data['category'] = combined_data['category'].replace(label_to_id)

# Same emoji normalisation as applied to the training text.
for text_col in ('eng', 'tamil'):
    combined_data[text_col] = combined_data[text_col].apply(
        lambda x: emoji.demojize(x, delimiters=(" ", " "))
    )
combined_data

Unnamed: 0,category,eng,tamil,ln
0,0,Spr....2016 poitan feel happy with my partner ...,வசந்தம்....2016 போனேன் என் பார்ட்னருடன் மகிழ்ச...,eng_tam
1,0,R u still with ur partner,நீங்கள் இன்னும் உங்கள் துணையுடன் இருக்கிறீர்கள்,eng_tam
2,0,excellent movie..no unnecessary drama or scene...,அருமையான திரைப்படம்..தேவையற்ற நாடகம் அல்லது கா...,eng_tam
3,0,"For those who speak about culture., pre marita...","கலாச்சாரத்தைப் பற்றி பேசுபவர்களுக்கு, திருமணத்...",eng_tam
4,0,Best movie and people not understand relations...,சிறந்த திரைப்படம் மற்றும் மக்கள் உறவைப் புரிந்...,eng_tam
...,...,...,...,...
985,0,Looks like Karthik took advantage on Varun whe...,வருண் தனது தாயின் முக்கிய அன்பையும் நிழலையும் ...,eng_tam
986,0,i am really crying pro crying_face crying_fac...,நான் உண்மையிலேயே அழுகிறேன் ப்ரோ அழுகை_முகம் அழ...,eng_tam
987,0,They may be transgender but don't ever forgot ...,அவர்கள் திருநங்கைகளாக இருக்கலாம் ஆனால் அவர்களு...,eng_tam
988,0,It is their own choice. I support them No doub...,அது அவர்களின் சொந்த விருப்பம். நான் அவர்களை ஆத...,eng_tam


In [16]:

# Predict the English and the Tamil version of every test sentence and
# measure how often the two predictions agree.
test = combined_data['eng'].tolist()
org = combined_data['tamil'].tolist()
labels = combined_data['category'].tolist()

eng_predictions, _ = model.predict(test)
combined_data['eng_pred'] = eng_predictions
tamil_predictions, _ = model.predict(org)
combined_data['tamil_pred'] = tamil_predictions

from sklearn.metrics import accuracy_score
# accuracy_score of one prediction column against the other is the fraction
# of rows on which both language versions receive the same class.
sim = accuracy_score(combined_data['eng_pred'], combined_data['tamil_pred'])
print("similarity in predection score:", sim)

# Rows on which the two predictions differ.
disagreement_mask = combined_data['eng_pred'] != combined_data['tamil_pred']
combined_data[disagreement_mask]


  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

similarity in predection score: 0.7727272727272727


Unnamed: 0,category,eng,tamil,ln,eng_pred,tamil_pred
6,0,99% tamilans cant understand r accept it becau...,99% இத்தாலியர்களால் அதைப் புரிந்து கொள்ளவோ ​​அ...,eng_tam,1,0
7,0,U r very clean talker... Body for mud... Only ...,நீங்கள் மிகவும் சுத்தமாகப் பேசுபவர்... உடல் சே...,eng_tam,0,1
10,0,Good movie.. All the characters are gone good....,நல்ல படம்.. எல்லா கேரக்டர்களும் நல்லா போயிடுச்...,eng_tam,0,1
13,0,Really this prank will be more powerful Feedba...,நிஜமாகவே இந்த சேட்டை சமூகத்தில் இருந்து வரும் ...,eng_tam,0,1
16,0,stop explaining her as transgender is there an...,அவளை திருநங்கை என்று விளக்குவதை நிறுத்துங்கள்,eng_tam,0,1
...,...,...,...,...,...,...
970,0,It is right 100. It is their life and their ch...,இது சரி 100. அது அவர்களின் வாழ்க்கை மற்றும் அவ...,eng_tam,0,1
971,0,Did u know that women experience more pleasure...,பெண்கள் மற்ற பெண்களுடன் உடலுறவு கொள்ளும்போது அ...,eng_tam,0,1
972,0,How u r earning subscribers without video but ...,வீடியோ இல்லாமல் சந்தாதாரர்களை எப்படி சம்பாதிக்...,eng_tam,0,1
976,0,After living in western countries Iam very sur...,மேற்கத்திய நாடுகளில் வாழ்ந்த பிறகு அவர்கள் மிக...,eng_tam,0,1


In [17]:
from sklearn.metrics import classification_report
target_names = ['ally', 'homophobic', 'transphobic']
# Pin the label ids so that each report row is tied to the intended class name
# even when a class is absent from both the gold labels and the predictions.
# zero_division=0 reports 0 for a class that is never predicted instead of
# emitting an undefined-metric warning for it.
print(classification_report(
    labels,
    combined_data['tamil_pred'],
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.97      0.73      0.84       924
  homophobic       0.15      0.72      0.25        61
 transphobic       0.00      0.00      0.00         5

    accuracy                           0.73       990
   macro avg       0.37      0.48      0.36       990
weighted avg       0.92      0.73      0.79       990



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Collect the Tamil sentences whose predicted class differs from the gold label.
test = combined_data['tamil'].tolist()
labels = combined_data['category'].tolist()
# The Tamil texts were already scored when the 'tamil_pred' column was filled;
# reuse those predictions instead of running inference a second time.
predictions = combined_data['tamil_pred'].to_numpy()

misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"கலாச்சாரத்தைப் பற்றி பேசுபவர்களுக்கு, திருமணத்...",0,1
1,நீங்கள் மிகவும் சுத்தமாகப் பேசுபவர்... உடல் சே...,0,1
2,நல்ல படம்.. எல்லா கேரக்டர்களும் நல்லா போயிடுச்...,0,1
3,நிஜமாகவே இந்த சேட்டை சமூகத்தில் இருந்து வரும் ...,0,1
4,அவளை திருநங்கை என்று விளக்குவதை நிறுத்துங்கள்,0,1
...,...,...,...
265,இது சரி 100. அது அவர்களின் வாழ்க்கை மற்றும் அவ...,0,1
266,பெண்கள் மற்ற பெண்களுடன் உடலுறவு கொள்ளும்போது அ...,0,1
267,வீடியோ இல்லாமல் சந்தாதாரர்களை எப்படி சம்பாதிக்...,0,1
268,மேற்கத்திய நாடுகளில் வாழ்ந்த பிறகு அவர்கள் மிக...,0,1


# Training with English and testing with Hindi

In [19]:
# English test sentences. The header of this file pads the text column name
# with trailing spaces, so strip the column names rather than matching the
# exact amount of padding.
df_test = (
    pd.read_csv('/content/eng_3_test.tsv', delimiter='\t')
    .rename(columns=str.strip)
    .rename(columns={'text': 'eng'})
)

# Hindi version of the test sentences (assumed to be row-aligned with the
# English file — TODO confirm).
o_test = (
    pd.read_csv('/content/eng_hin_test.tsv', delimiter='\t')
    .rename(columns={'text': 'hindi'})
)

# Join the files side by side first and only then drop incomplete rows.
# Dropping NaNs per file before the join lets the index alignment of
# concat(axis=1) re-introduce NaNs whenever the two files lose different rows,
# which would then break emoji.demojize below.
combined_data = pd.concat([df_test, o_test], axis=1).dropna()
combined_data = combined_data.loc[:, ~combined_data.columns.str.contains('^Unnamed')]
# Keep columns that occur in both files (e.g. 'category') once, by name. This
# avoids transposing the whole frame, which also turns every dtype into object.
combined_data = combined_data.loc[:, ~combined_data.columns.duplicated()].copy()

# Same integer encoding as the training data; both neutral spellings map to 0.
label_to_id = {
    'Non-anti-LGBT+ content': 0,
    'ally': 0,
    'Homophobic': 1,
    'Transphobic': 2,
}
combined_data['category'] = combined_data['category'].replace(label_to_id)

# Same emoji normalisation as applied to the training text.
for text_col in ('eng', 'hindi'):
    combined_data[text_col] = combined_data[text_col].apply(
        lambda x: emoji.demojize(x, delimiters=(" ", " "))
    )
combined_data

Unnamed: 0,category,eng,hindi,ln
0,0,Spr....2016 poitan feel happy with my partner ...,Spr....2016 कवि अपने पार्टनर के साथ खुश हैं,eng_hin
1,0,R u still with ur partner,क्या आप अभी भी अपने साथी के साथ हैं,eng_hin
2,0,excellent movie..no unnecessary drama or scene...,उत्कृष्ट फिल्म..कोई अनावश्यक नाटक या दृश्य नही...,eng_hin
3,0,"For those who speak about culture., pre marita...","संस्कृति की बात करने वालों के लिए, शादी से पहल...",eng_hin
4,0,Best movie and people not understand relations...,बेहतरीन फिल्म और लोग रिश्ते को नहीं समझते हैं ...,eng_hin
...,...,...,...,...
985,0,Looks like Karthik took advantage on Varun whe...,ऐसा लगता है कि कार्तिक ने वरुण पर फायदा उठाया ...,eng_hin
986,0,i am really crying pro crying_face crying_fac...,मैं सच में रो रहा हूँ चेहरा रोना_चेहरा रोना_चे...,eng_hin
987,0,They may be transgender but don't ever forgot ...,वो ट्रांसजेंडर हो सकते हैं लेकिन ये कभी नहीं भ...,eng_hin
988,0,It is their own choice. I support them No doub...,यह उनकी अपनी पसंद है। मैं उनका समर्थन करता हूं...,eng_hin


In [20]:

# Predict the English and the Hindi version of every test sentence and
# measure how often the two predictions agree.
test = combined_data['eng'].tolist()
org = combined_data['hindi'].tolist()
labels = combined_data['category'].tolist()

eng_predictions, _ = model.predict(test)
combined_data['eng_pred'] = eng_predictions
hindi_predictions, _ = model.predict(org)
combined_data['hindi_pred'] = hindi_predictions

from sklearn.metrics import accuracy_score
# accuracy_score of one prediction column against the other is the fraction
# of rows on which both language versions receive the same class.
sim = accuracy_score(combined_data['eng_pred'], combined_data['hindi_pred'])
print("similarity in predection score:", sim)

# Rows on which the two predictions differ.
disagreement_mask = combined_data['eng_pred'] != combined_data['hindi_pred']
combined_data[disagreement_mask]


  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

similarity in predection score: 0.8121212121212121


Unnamed: 0,category,eng,hindi,ln,eng_pred,hindi_pred
6,0,99% tamilans cant understand r accept it becau...,99% इटालियंस इसे समझ या स्वीकार नहीं कर सकते क...,eng_hin,1,0
9,0,Ak Be The Change I don't think it's good ..it...,एके बी द चेंज मुझे नहीं लगता कि यह अच्छा है ....,eng_hin,0,1
10,0,Good movie.. All the characters are gone good....,अच्छी फिल्म.. सारे किरदार अच्छे हो गए हैं.. हा...,eng_hin,0,1
13,0,Really this prank will be more powerful Feedba...,वास्तव में यह शरारत समाज से अधिक शक्तिशाली प्र...,eng_hin,0,1
16,0,stop explaining her as transgender is there an...,उसे ट्रांसजेंडर के रूप में समझाना बंद करें क्य...,eng_hin,0,1
...,...,...,...,...,...,...
964,0,Hats off bro advance 1millon subscriber vara ...,हैट्स ऑफ ब्रो एडवांस 1 मिलियन सब्सक्राइबर वारा...,eng_hin,0,1
967,0,Jugal Romil of course me too ...,जुगल रोमिल बेशक मैं भी,eng_hin,0,1
970,0,It is right 100. It is their life and their ch...,यह सही 100 है। यह उनका जीवन और उनकी पसंद है। म...,eng_hin,0,1
975,0,Really great akka,वाकई बढ़िया अक्का,eng_hin,0,1


In [21]:
from sklearn.metrics import classification_report
target_names = ['ally', 'homophobic', 'transphobic']
# Pin the label ids so that each report row is tied to the intended class name
# even when a class is absent from both the gold labels and the predictions.
# zero_division=0 reports 0 for a class that is never predicted instead of
# emitting an undefined-metric warning for it.
print(classification_report(
    labels,
    combined_data['hindi_pred'],
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.96      0.80      0.88       924
  homophobic       0.16      0.56      0.25        61
 transphobic       0.00      0.00      0.00         5

    accuracy                           0.78       990
   macro avg       0.37      0.45      0.37       990
weighted avg       0.91      0.78      0.83       990



In [22]:
# Collect the Hindi sentences whose predicted class differs from the gold label.
test = combined_data['hindi'].tolist()
labels = combined_data['category'].tolist()
# The Hindi texts were already scored when the 'hindi_pred' column was filled;
# reuse those predictions instead of running inference a second time.
predictions = combined_data['hindi_pred'].to_numpy()

misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"संस्कृति की बात करने वालों के लिए, शादी से पहल...",0,1
1,एके बी द चेंज मुझे नहीं लगता कि यह अच्छा है ....,0,1
2,अच्छी फिल्म.. सारे किरदार अच्छे हो गए हैं.. हा...,0,1
3,वास्तव में यह शरारत समाज से अधिक शक्तिशाली प्र...,0,1
4,उसे ट्रांसजेंडर के रूप में समझाना बंद करें क्य...,0,1
...,...,...,...
208,अनीश कूल एक कानून जो सिर्फ 70 साल की अवधि में ...,0,1
209,जुगल रोमिल बेशक मैं भी,0,1
210,यह सही 100 है। यह उनका जीवन और उनकी पसंद है। म...,0,1
211,वाकई बढ़िया अक्का,0,1


# Training with English and testing with Telugu

In [23]:
# English test sentences. The header of this file pads the text column name
# with trailing spaces, so strip the column names rather than matching the
# exact amount of padding.
df_test = (
    pd.read_csv('/content/eng_3_test.tsv', delimiter='\t')
    .rename(columns=str.strip)
    .rename(columns={'text': 'eng'})
)

# Telugu version of the test sentences (assumed to be row-aligned with the
# English file — TODO confirm).
o_test = (
    pd.read_csv('/content/eng_tel_test.tsv', delimiter='\t')
    .rename(columns={'text': 'tel'})
)

# Join the files side by side first and only then drop incomplete rows.
# Dropping NaNs per file before the join lets the index alignment of
# concat(axis=1) re-introduce NaNs whenever the two files lose different rows,
# which would then break emoji.demojize below.
combined_data = pd.concat([df_test, o_test], axis=1).dropna()
combined_data = combined_data.loc[:, ~combined_data.columns.str.contains('^Unnamed')]
# Keep columns that occur in both files (e.g. 'category') once, by name. This
# avoids transposing the whole frame, which also turns every dtype into object.
combined_data = combined_data.loc[:, ~combined_data.columns.duplicated()].copy()

# Same integer encoding as the training data; both neutral spellings map to 0.
label_to_id = {
    'Non-anti-LGBT+ content': 0,
    'ally': 0,
    'Homophobic': 1,
    'Transphobic': 2,
}
combined_data['category'] = combined_data['category'].replace(label_to_id)

# Same emoji normalisation as applied to the training text.
for text_col in ('eng', 'tel'):
    combined_data[text_col] = combined_data[text_col].apply(
        lambda x: emoji.demojize(x, delimiters=(" ", " "))
    )
combined_data

Unnamed: 0,category,eng,tel,ln
0,0,Spr....2016 poitan feel happy with my partner ...,వసంతం....2016 పోయిటన్ నా భాగస్వామితో సంతోషంగా ...,eng_tel
1,0,R u still with ur partner,మీరు ఇప్పటికీ మీ భాగస్వామితో ఉన్నారు,eng_tel
2,0,excellent movie..no unnecessary drama or scene...,అద్భుతమైన సినిమా..అనవసరమైన డ్రామా లేదా సన్నివే...,eng_tel
3,0,"For those who speak about culture., pre marita...","సంస్కృతి గురించి మాట్లాడే వారికి, వివాహానికి మ...",eng_tel
4,0,Best movie and people not understand relations...,ఉత్తమ చిత్రం మరియు వ్యక్తులు సంబంధాన్ని అర్థం ...,eng_tel
...,...,...,...,...
985,0,Looks like Karthik took advantage on Varun whe...,కార్తీక్ వరుణ్ తన తల్లి నుండి తన ప్రధాన ప్రేమన...,eng_tel
986,0,i am really crying pro crying_face crying_fac...,నేను నిజంగా ఏడుస్తున్నాను ప్రో క్రయింగ్_ఫేస్ క...,eng_tel
987,0,They may be transgender but don't ever forgot ...,వారు లింగమార్పిడి కావచ్చు కానీ వారు కూడా మనుషు...,eng_tel
988,0,It is their own choice. I support them No doub...,అది వారి స్వంత ఎంపిక. నేను వారికి మద్దతు ఇస్తు...,eng_tel


In [24]:

# Predict the English and the Telugu version of every test sentence and
# measure how often the two predictions agree.
test = combined_data['eng'].tolist()
org = combined_data['tel'].tolist()
labels = combined_data['category'].tolist()

eng_predictions, _ = model.predict(test)
combined_data['eng_pred'] = eng_predictions
tel_predictions, _ = model.predict(org)
combined_data['tel_pred'] = tel_predictions

from sklearn.metrics import accuracy_score
# accuracy_score of one prediction column against the other is the fraction
# of rows on which both language versions receive the same class.
sim = accuracy_score(combined_data['eng_pred'], combined_data['tel_pred'])
print("similarity in predection score:", sim)

# Rows on which the two predictions differ.
disagreement_mask = combined_data['eng_pred'] != combined_data['tel_pred']
combined_data[disagreement_mask]


  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

similarity in predection score: 0.7686868686868686


Unnamed: 0,category,eng,tel,ln,eng_pred,tel_pred
6,0,99% tamilans cant understand r accept it becau...,99% ఇటాలియన్లు దీనిని అర్థం చేసుకోలేరు లేదా అం...,eng_tel,1,0
9,0,Ak Be The Change I don't think it's good ..it...,అక్ బీ ది చేంజ్ ఇది మంచిది కాదని నేను అనుకుంట...,eng_tel,0,1
10,0,Good movie.. All the characters are gone good....,మంచి సినిమా.. పాత్రలన్నీ బాగానే వచ్చాయి.. అవున...,eng_tel,0,1
16,0,stop explaining her as transgender is there an...,లింగమార్పిడి అని ఆమెను వివరించడం మానేయండి,eng_tel,0,1
27,0,Super anna..Verra level..All are have red bloo...,సూపర్ అన్నా..వెర్ర లెవెల్..అందరికీ ఎర్రటి రక్త...,eng_tel,0,1
...,...,...,...,...,...,...
970,0,It is right 100. It is their life and their ch...,ఇది సరైనది 100. ఇది వారి జీవితం మరియు వారి ఎంప...,eng_tel,0,1
971,0,Did u know that women experience more pleasure...,స్త్రీలు ఇతర స్త్రీలతో సెక్స్ చేసినప్పుడు ఎక్క...,eng_tel,0,1
972,0,How u r earning subscribers without video but ...,మీరు వీడియో లేకుండా సబ్‌స్క్రైబర్‌లను ఎలా సంపా...,eng_tel,0,1
976,0,After living in western countries Iam very sur...,పాశ్చాత్య దేశాలలో నివసించిన తర్వాత వారు మరింత ...,eng_tel,0,1


In [25]:
from sklearn.metrics import classification_report
target_names = ['ally', 'homophobic', 'transphobic']
# Pin the label ids so that each report row is tied to the intended class name
# even when a class is absent from both the gold labels and the predictions.
# zero_division=0 reports 0 for a class that is never predicted instead of
# emitting an undefined-metric warning for it.
print(classification_report(
    labels,
    combined_data['tel_pred'],
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.97      0.74      0.84       924
  homophobic       0.16      0.72      0.26        61
 transphobic       0.00      0.00      0.00         5

    accuracy                           0.74       990
   macro avg       0.38      0.49      0.37       990
weighted avg       0.92      0.74      0.80       990



In [26]:
# Collect the Telugu sentences whose predicted class differs from the gold label.
test = combined_data['tel'].tolist()
labels = combined_data['category'].tolist()
# The Telugu texts were already scored when the 'tel_pred' column was filled;
# reuse those predictions instead of running inference a second time.
predictions = combined_data['tel_pred'].to_numpy()

misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"సంస్కృతి గురించి మాట్లాడే వారికి, వివాహానికి మ...",0,1
1,అక్ బీ ది చేంజ్ ఇది మంచిది కాదని నేను అనుకుంట...,0,1
2,మంచి సినిమా.. పాత్రలన్నీ బాగానే వచ్చాయి.. అవున...,0,1
3,లింగమార్పిడి అని ఆమెను వివరించడం మానేయండి,0,1
4,సూపర్ అన్నా..వెర్ర లెవెల్..అందరికీ ఎర్రటి రక్త...,0,1
...,...,...,...
254,మీరు వీడియో లేకుండా సబ్‌స్క్రైబర్‌లను ఎలా సంపా...,0,1
255,పాశ్చాత్య దేశాలలో నివసించిన తర్వాత వారు మరింత ...,0,1
256,4 అరివు అతిహమ్ బ్రో... అరివే ఇల్లతవన్ కుడుత తీ...,0,1
257,చెత్త తిరునంగై ఇడియట్ చె గో వే వే,0,1


# Training with English and testing with Malayalam

In [27]:
# English test sentences. The header of this file pads the text column name
# with trailing spaces, so strip the column names rather than matching the
# exact amount of padding.
df_test = (
    pd.read_csv('/content/eng_3_test.tsv', delimiter='\t')
    .rename(columns=str.strip)
    .rename(columns={'text': 'eng'})
)

# Malayalam version of the test sentences (assumed to be row-aligned with the
# English file — TODO confirm).
o_test = (
    pd.read_csv('/content/eng_ml_test.tsv', delimiter='\t')
    .rename(columns={'text': 'mal'})
)

# Join the files side by side first and only then drop incomplete rows.
# Dropping NaNs per file before the join lets the index alignment of
# concat(axis=1) re-introduce NaNs whenever the two files lose different rows,
# which would then break emoji.demojize below.
combined_data = pd.concat([df_test, o_test], axis=1).dropna()
combined_data = combined_data.loc[:, ~combined_data.columns.str.contains('^Unnamed')]
# Keep columns that occur in both files (e.g. 'category') once, by name. This
# avoids transposing the whole frame, which also turns every dtype into object.
combined_data = combined_data.loc[:, ~combined_data.columns.duplicated()].copy()

# Same integer encoding as the training data; both neutral spellings map to 0.
label_to_id = {
    'Non-anti-LGBT+ content': 0,
    'ally': 0,
    'Homophobic': 1,
    'Transphobic': 2,
}
combined_data['category'] = combined_data['category'].replace(label_to_id)

# Same emoji normalisation as applied to the training text.
for text_col in ('eng', 'mal'):
    combined_data[text_col] = combined_data[text_col].apply(
        lambda x: emoji.demojize(x, delimiters=(" ", " "))
    )
combined_data

Unnamed: 0,category,eng,mal,ln
0,0,Spr....2016 poitan feel happy with my partner ...,Spr....2016 പോയതാൻ എന്റെ പങ്കാളിയുമായി സന്തോഷമ...,eng_ml
1,0,R u still with ur partner,നിങ്ങൾ ഇപ്പോഴും നിങ്ങളുടെ പങ്കാളിക്കൊപ്പമാണ്,eng_ml
2,0,excellent movie..no unnecessary drama or scene...,മികച്ച സിനിമ.. അനാവശ്യ നാടകങ്ങളോ രംഗങ്ങളോ ഇല്ല...,eng_ml
3,0,"For those who speak about culture., pre marita...","സംസ്കാരത്തെക്കുറിച്ച് സംസാരിക്കുന്നവർക്ക്, വിവ...",eng_ml
4,0,Best movie and people not understand relations...,മികച്ച സിനിമയും ആളുകൾക്ക് ബന്ധം മനസ്സിലാകാത്തത...,eng_ml
...,...,...,...,...
985,0,Looks like Karthik took advantage on Varun whe...,അമ്മയിൽ നിന്നുള്ള പ്രധാന സ്നേഹവും നിഴലുകളും നഷ...,eng_ml
986,0,i am really crying pro crying_face crying_fac...,ഞാൻ ശരിക്കും കരയുന്നു പ്രോ കരയുന്നു_മുഖം കരയുന...,eng_ml
987,0,They may be transgender but don't ever forgot ...,അവർ ട്രാൻസ്‌ജെൻഡർ ആയിരിക്കാം പക്ഷെ അവരും മനുഷ്...,eng_ml
988,0,It is their own choice. I support them No doub...,അത് അവരുടെ സ്വന്തം തിരഞ്ഞെടുപ്പാണ്. ഞാൻ അവരെ പ...,eng_ml


In [28]:

# Predict the English and the Malayalam version of every test sentence and
# measure how often the two predictions agree.
test = combined_data['eng'].tolist()
org = combined_data['mal'].tolist()
labels = combined_data['category'].tolist()

eng_predictions, _ = model.predict(test)
combined_data['eng_pred'] = eng_predictions
mal_predictions, _ = model.predict(org)
combined_data['mal_pred'] = mal_predictions

from sklearn.metrics import accuracy_score
# accuracy_score of one prediction column against the other is the fraction
# of rows on which both language versions receive the same class.
sim = accuracy_score(combined_data['eng_pred'], combined_data['mal_pred'])
print("similarity in predection score:", sim)

# Rows on which the two predictions differ.
disagreement_mask = combined_data['eng_pred'] != combined_data['mal_pred']
combined_data[disagreement_mask]


  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

similarity in predection score: 0.7303030303030303


Unnamed: 0,category,eng,mal,ln,eng_pred,mal_pred
6,0,99% tamilans cant understand r accept it becau...,99% ഇറ്റലിക്കാർക്കും ഇത് മനസ്സിലാക്കാനോ അംഗീകര...,eng_ml,1,0
7,0,U r very clean talker... Body for mud... Only ...,നിങ്ങൾ വളരെ വൃത്തിയുള്ള സംസാരക്കാരൻ... ശരീരം ച...,eng_ml,0,1
9,0,Ak Be The Change I don't think it's good ..it...,Ak Be The Change ഇത് നല്ലതാണെന്ന് എനിക്ക് തോന...,eng_ml,0,1
10,0,Good movie.. All the characters are gone good....,നല്ല സിനിമ.. എല്ലാ കഥാപാത്രങ്ങളും നന്നായി പോയി...,eng_ml,0,1
16,0,stop explaining her as transgender is there an...,ട്രാൻസ്‌ജെൻഡർ ആണെന്ന് അവളെ വിശദീകരിക്കുന്നത് ന...,eng_ml,0,1
...,...,...,...,...,...,...
972,0,How u r earning subscribers without video but ...,വീഡിയോ ഇല്ലാതെ നിങ്ങൾ എങ്ങനെ സബ്‌സ്‌ക്രൈബർമാരെ...,eng_ml,0,1
980,0,4 arivu athiham bro... Arivey illathavan kudut...,4 അറിവ് അതിഹം ബ്രോ... അറിവ് ഇല്ലതവൻ കുടുത തീർപ...,eng_ml,0,1
982,0,Worst thirunangai idiot che go way ...,ഏറ്റവും മോശം തിരുനംഗൈ ഇഡിയറ്റ് ചേ ഗോ വേ,eng_ml,1,0
983,0,Hats off handshake .... Semma semma finally t...,ഹാറ്റ്‌സ് ഓഫ് ഹാൻഡ്‌ഷേക്ക് .... സെമ്മ സെമ്മ ഒട...,eng_ml,0,1


In [29]:
from sklearn.metrics import classification_report

# Class ids used by this notebook: 0 = ally, 1 = homophobic, 2 = transphobic.
target_names = ['ally', 'homophobic', 'transphobic']

# `labels=[0, 1, 2]` pins each display name to its class id even if a class is
# absent from both the gold labels and the predictions (the transphobic class
# has very few rows here). `zero_division=0` reports 0.0 without a warning for
# classes that are never predicted.
print(classification_report(
    labels,
    combined_data['mal_pred'],
    labels=[0, 1, 2],
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.98      0.69      0.81       924
  homophobic       0.15      0.82      0.25        61
 transphobic       0.00      0.00      0.00         5

    accuracy                           0.69       990
   macro avg       0.38      0.50      0.35       990
weighted avg       0.92      0.69      0.77       990



In [30]:
# Malayalam texts and their gold labels.
test = combined_data['mal'].tolist()
labels = combined_data['category'].tolist()

# Reuse the Malayalam predictions already stored on the frame when they exist,
# rather than running a second full inference pass over the same texts.
if 'mal_pred' in combined_data.columns:
    predictions = combined_data['mal_pred'].tolist()
else:
    predictions, _ = model.predict(test)

# One record per text whose predicted class differs from the gold label.
misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

# Explicit columns keep the frame well-formed even when nothing is misclassified.
mis = pd.DataFrame(misclassified_records, columns=['text', 'true_label', 'predicted_label'])
mis

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"സംസ്കാരത്തെക്കുറിച്ച് സംസാരിക്കുന്നവർക്ക്, വിവ...",0,1
1,നിങ്ങൾ വളരെ വൃത്തിയുള്ള സംസാരക്കാരൻ... ശരീരം ച...,0,1
2,Ak Be The Change ഇത് നല്ലതാണെന്ന് എനിക്ക് തോന...,0,1
3,നല്ല സിനിമ.. എല്ലാ കഥാപാത്രങ്ങളും നന്നായി പോയി...,0,1
4,ട്രാൻസ്‌ജെൻഡർ ആണെന്ന് അവളെ വിശദീകരിക്കുന്നത് ന...,0,1
...,...,...,...
300,ഇത് ശരിയാണ് 100. അത് അവരുടെ ജീവിതവും അവരുടെ തി...,0,1
301,മറ്റ് സ്ത്രീകളുമായി ലൈംഗിക ബന്ധത്തിൽ ഏർപ്പെടുമ...,0,1
302,വീഡിയോ ഇല്ലാതെ നിങ്ങൾ എങ്ങനെ സബ്‌സ്‌ക്രൈബർമാരെ...,0,1
303,4 അറിവ് അതിഹം ബ്രോ... അറിവ് ഇല്ലതവൻ കുടുത തീർപ...,0,1


Here we rephrased a few sentences and tested them on our model to analyse its performance.

Robustness test

In [31]:

# Hand-written probe sentences for the robustness check.
sentences = [
    "homophobic are individuals who support and advocate for marginalized communities, even if they do not belong to those communities themselves.",
    "Being an ally requires more than just a willingness to help - it requires active listening, learning, and action.",
    "Allies can play a critical role in advancing social justice and creating more equitable societies.",
    "Allies should always prioritize the needs and voices of marginalized communities over their own.",
    "Allies should use their privilege to uplift and amplify the voices of marginalized individuals.",
    "Allies should be willing to engage in uncomfortable conversations and challenge their own biases and assumptions.",
    "homophobic is the irrational fear or hatred of transgender people.",
    "Discrimination against transgender individuals is a form of transphobia.",
    "Using the wrong pronouns to refer to a transgender person is an act of transphobia.",
    "Transphobia can have serious negative impacts on the mental health of transgender individuals.",
    "Allies of the transgender community should work to combat transphobia wherever they encounter it.",
    "Transphobia is a form of prejudice that has no place in a just and equitable society.",
    "Transphobia often stems from a lack of understanding or empathy for the experiences of transgender individuals.",
    "Transphobia can manifest in many different ways, from physical violence to verbal harassment to exclusion from certain spaces.",
    "Transphobia is not just a problem for transgender individuals, but for society as a whole.",
    "By standing up against transphobia, we can create a world where everyone is accepted for who they are.",
]

# Expected class id per sentence, in the same order as `sentences`.
# NOTE(review): several sentences that argue against transphobia are labelled 2;
# confirm these labels are the intended ground truth.
labels = [
    1, 0, 0, 0, 0, 0,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
]

test = sentences
predictions, _ = model.predict(test)

# Keep only the sentences whose prediction differs from the expected label.
misclassified_records = [
    {'text': sentence, 'true_label': expected, 'predicted_label': predicted}
    for sentence, expected, predicted in zip(test, labels, predictions)
    if predicted != expected
]

mis = pd.DataFrame(misclassified_records)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
# Display the misclassified probe sentences collected by the previous cell.
mis

Unnamed: 0,text,true_label,predicted_label
0,Being an ally requires more than just a willin...,0,1
1,Allies can play a critical role in advancing s...,0,1
2,Allies should be willing to engage in uncomfor...,0,1
3,homophobic is the irrational fear or hatred of...,1,0
4,Discrimination against transgender individuals...,2,1
5,Using the wrong pronouns to refer to a transge...,2,1
6,Transphobia can have serious negative impacts ...,2,1
7,Transphobia is a form of prejudice that has no...,2,1
8,Transphobia often stems from a lack of underst...,2,1
9,Transphobia can manifest in many different way...,2,1


In [33]:

# This cell repeats the probe sentences, labels and evaluation of the
# preceding robustness cell.
sentences = [
    "homophobic are individuals who support and advocate for marginalized communities, even if they do not belong to those communities themselves.",
    "Being an ally requires more than just a willingness to help - it requires active listening, learning, and action.",
    "Allies can play a critical role in advancing social justice and creating more equitable societies.",
    "Allies should always prioritize the needs and voices of marginalized communities over their own.",
    "Allies should use their privilege to uplift and amplify the voices of marginalized individuals.",
    "Allies should be willing to engage in uncomfortable conversations and challenge their own biases and assumptions.",
    "homophobic is the irrational fear or hatred of transgender people.",
    "Discrimination against transgender individuals is a form of transphobia.",
    "Using the wrong pronouns to refer to a transgender person is an act of transphobia.",
    "Transphobia can have serious negative impacts on the mental health of transgender individuals.",
    "Allies of the transgender community should work to combat transphobia wherever they encounter it.",
    "Transphobia is a form of prejudice that has no place in a just and equitable society.",
    "Transphobia often stems from a lack of understanding or empathy for the experiences of transgender individuals.",
    "Transphobia can manifest in many different ways, from physical violence to verbal harassment to exclusion from certain spaces.",
    "Transphobia is not just a problem for transgender individuals, but for society as a whole.",
    "By standing up against transphobia, we can create a world where everyone is accepted for who they are.",
]
# Expected class id for each sentence, position by position.
labels = [1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]

test = sentences
predictions, _ = model.predict(test)

# Collect every sentence whose predicted class differs from the expected one.
misclassified_records = []
for position, predicted in enumerate(predictions):
    expected = labels[position]
    if predicted == expected:
        continue
    misclassified_records.append(
        {'text': test[position], 'true_label': expected, 'predicted_label': predicted}
    )

mis = pd.DataFrame(misclassified_records)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [34]:



# Probe sentences about homophobia for the robustness check.
sentences = [
    "Homophobia is the irrational fear or hatred of LGBTQ+ individuals, specifically those who are attracted to people of the same gender.",
    "Homophobia can take many forms, including bullying, harassment, discrimination, and violence.",
    "Homophobia can have serious negative impacts on the mental health and well-being of LGBTQ+ individuals.",
    "Allies of the LGBTQ+ community should work to combat homophobia wherever they encounter it.",
    "Homophobia is a form of prejudice that has no place in a just and equitable society.",
    "Homophobia often stems from a lack of understanding or empathy for the experiences of LGBTQ+ individuals.",
    "Homophobia can manifest in many different ways, from derogatory language to physical violence to exclusion from certain spaces.",
    "Homophobia is not just a problem for LGBTQ+ individuals, but for society as a whole.",
    "By standing up against homophobia, we can create a world where everyone is free to love who they choose.",
    "Homophobia is not only harmful, it is also illogical and unjustified.",
]

# Expected class id per sentence, in the same order as `sentences`.
labels = [
    1, 1, 1, 0, 1,
    1, 1, 1, 1, 1,
]

test = sentences
predictions, _ = model.predict(test)

# Keep only the sentences whose prediction differs from the expected label.
misclassified_records = [
    {'text': sentence, 'true_label': expected, 'predicted_label': predicted}
    for sentence, expected, predicted in zip(test, labels, predictions)
    if predicted != expected
]

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,Homophobia is the irrational fear or hatred of...,1,0
1,Homophobia is not just a problem for LGBTQ+ in...,1,0


In [35]:
# Install nlpaug, which supplies the word- and character-level augmenters imported in the next cell.
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [36]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import random

import numpy as np

# Seed the global RNGs so repeated runs are more likely to yield the same
# perturbed sentences.
# NOTE(review): assumes nlpaug draws from `random` / `numpy.random`; confirm.
random.seed(42)
np.random.seed(42)

# Work on a positionally indexed copy so the column-wise concat below lines up.
data = test_df.copy().reset_index(drop=True)
syn_aug = naw.SynonymAug(aug_src='wordnet')

# Collect exactly one augmented string per input row. Extending the list with
# whatever `augment` returns would shift every later row against its label
# whenever a call yields zero or several results.
lis = []
for original_text in data['text']:
    augmented = syn_aug.augment(original_text, n=1)
    if isinstance(augmented, str):
        # NOTE(review): some nlpaug versions appear to return a bare string
        # rather than a list; confirm against the installed version.
        augmented = [augmented]
    # Fall back to the unmodified text when the augmenter yields nothing.
    lis.append(augmented[0] if augmented else original_text)

tf = pd.DataFrame({'Changed_text': lis})
assert len(tf) == len(data), "augmented texts must align one-to-one with the test rows"

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,labels,Changed_text
0,This is an alu puntainga karumo chiii,2,This be an alu puntainga karumo chiii
1,"Just thinking about it, if there is a psycho, ...",0,"Barely imagine astir it, if there personify a ..."
2,You can see the boy in the shape of a boy.. Pa...,1,You hind end see the boy in the soma of a boy....
3,"third polygamists, those who eat manlike indiv...",2,"3rd polygamists, those world health organizati..."
4,"{Singal tee machi u tube} Like Chanel, can you...",0,"{ Singal tee machi u tube} Like Chanel, can yo..."
...,...,...,...
1773,She acts very nice. Deoxyadenosine monophospha...,2,She play very nice. Deoxyadenosine monophospha...
1774,Malini sister is awesome,0,Malini sister be awful
1775,It is acceptable in our culture ...,0,It is satisfactory in our acculturation
1776,Having no word to describe but I seen multiple...,0,Having no word to describe but I seen multiple...


In [37]:
from sklearn.metrics import accuracy_score

# Parallel views of the same rows: gold labels, original text, perturbed text.
labels = combined_data['labels'].tolist()
org = combined_data['text'].tolist()
test = combined_data['Changed_text'].tolist()

# Only the first element returned by model.predict (the predicted class ids)
# is kept; the perturbed text is scored first, then the original.
combined_data['Changed_text_labels'] = model.predict(test)[0]
combined_data['text_labels'] = model.predict(org)[0]

# Fraction of rows whose prediction is unchanged by the perturbation.
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])
print("similarity in predection score:", sim)

# Show the rows where the perturbation flipped the prediction.
combined_data.loc[combined_data['Changed_text_labels'].ne(combined_data['text_labels'])]

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

similarity in predection score: 0.8397075365579303


Unnamed: 0,text,labels,Changed_text,Changed_text_labels,text_labels
2,You can see the boy in the shape of a boy.. Pa...,1,You hind end see the boy in the soma of a boy....,1,0
4,"{Singal tee machi u tube} Like Chanel, can you...",0,"{ Singal tee machi u tube} Like Chanel, can yo...",2,0
5,That's good girl,0,That ' s proficient young woman,1,0
6,J. devi jahir J. devi jahir Non all men kill. ...,2,J. devi jahir J. devi jahir Non all men toss o...,0,2
10,The young woman who deplete up overawe curry,2,The unseasoned fair sex who deplete up overawe...,1,2
...,...,...,...,...,...
1748,You need to be aware of the people who are att...,0,You demand to constitute aware of the mass who...,1,0
1763,Those who said it shoul be between male and fe...,0,Those who said it shoul be between male and fe...,1,0
1767,Transgenders have a good heart...these women d...,0,Transgenders make a good heart and soul. .. th...,1,0
1771,I appreciate your open talk.in reality how man...,0,Ane revalue your open talk. in world how many ...,1,0


In [38]:
# Perturbed texts and their gold labels.
test = combined_data['Changed_text'].tolist()
labels = combined_data['labels'].tolist()

# Reuse the predictions for the perturbed text already stored on the frame when
# they exist, rather than running another full inference pass over the same texts.
if 'Changed_text_labels' in combined_data.columns:
    predictions = combined_data['Changed_text_labels'].tolist()
else:
    predictions, _ = model.predict(test)

# One record per text whose predicted class differs from the gold label.
misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

# Explicit columns keep the frame well-formed even when nothing is misclassified.
mis = pd.DataFrame(misclassified_records, columns=['text', 'true_label', 'predicted_label'])
mis

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Barely imagine astir it, if there personify a ...",0,1
1,"{ Singal tee machi u tube} Like Chanel, can yo...",0,2
2,That ' s proficient young woman,0,1
3,J. devi jahir J. devi jahir Non all men toss o...,2,0
4,The unseasoned fair sex who deplete up overawe...,2,1
...,...,...,...
355,There is a gay lesbian combination in every li...,0,1
356,Transgenders make a good heart and soul. .. th...,0,1
357,Ane revalue your open talk. in world how many ...,0,1
358,She play very nice. Deoxyadenosine monophospha...,2,1


In [39]:
from sklearn.metrics import classification_report

# Class ids used by this notebook: 0 = ally, 1 = homophobic, 2 = transphobic.
target_names = ['ally', 'homophobic', 'transphobic']

# `labels=[0, 1, 2]` pins each display name to its class id regardless of which
# classes appear in the data. `zero_division=0` reports 0.0 without a warning
# for a class that is never predicted.
print(classification_report(
    labels,
    predictions,
    labels=[0, 1, 2],
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.87      0.60      0.71       599
  homophobic       0.70      0.87      0.78       598
 transphobic       0.86      0.93      0.89       581

    accuracy                           0.80      1778
   macro avg       0.81      0.80      0.79      1778
weighted avg       0.81      0.80      0.79      1778



In [40]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import random

import numpy as np

# Seed the global RNGs so repeated runs are more likely to yield the same
# perturbed sentences.
# NOTE(review): assumes nlpaug draws from `random` / `numpy.random`; confirm.
random.seed(42)
np.random.seed(42)

# Work on a positionally indexed copy so the column-wise concat below lines up.
data = test_df.copy().reset_index(drop=True)
char_aug = nac.RandomCharAug(action="insert")

# Collect exactly one augmented string per input row. Extending the list with
# whatever `augment` returns would shift every later row against its label
# whenever a call yields zero or several results.
lis = []
for original_text in data['text']:
    augmented = char_aug.augment(original_text, n=1)
    if isinstance(augmented, str):
        # NOTE(review): some nlpaug versions appear to return a bare string
        # rather than a list; confirm against the installed version.
        augmented = [augmented]
    # Fall back to the unmodified text when the augmenter yields nothing.
    lis.append(augmented[0] if augmented else original_text)

tf = pd.DataFrame({'Changed_text': lis})
assert len(tf) == len(data), "augmented texts must align one-to-one with the test rows"

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,labels,Changed_text
0,This is an alu puntainga karumo chiii,2,TShims is an alu puntainga ka3rulmo chiDiMi
1,"Just thinking about it, if there is a psycho, ...",0,"J5usit thinRkiMnDg about it, if there is a Gps..."
2,You can see the boy in the shape of a boy.. Pa...,1,You can see the boy in the psha+pe of a boy. ....
3,"third polygamists, those who eat manlike indiv...",2,"thilrkd polygamists, those who eat manql&ikje ..."
4,"{Singal tee machi u tube} Like Chanel, can you...",0,{ SgingHal tee Fmauchi u tu0bpe} BLikce ChaMne...
...,...,...,...
1773,She acts very nice. Deoxyadenosine monophospha...,2,She acts very nice. DeoTxyaYdenlosYin!e monoph...
1774,Malini sister is awesome,0,MUalin#i si_st0er is awesome
1775,It is acceptable in our culture ...,0,It is acAc^eptRable in our cuvltcu%re
1776,Having no word to describe but I seen multiple...,0,Having no word to describe but I seen multiple...


In [41]:
from sklearn.metrics import accuracy_score

# Parallel views of the same rows: gold labels, original text, noised text.
labels = combined_data['labels'].tolist()
org = combined_data['text'].tolist()
test = combined_data['Changed_text'].tolist()

# Only the first element returned by model.predict (the predicted class ids)
# is kept; the noised text is scored first, then the original.
combined_data['Changed_text_labels'] = model.predict(test)[0]
combined_data['text_labels'] = model.predict(org)[0]

# Fraction of rows whose prediction is unchanged by the character noise.
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])
print("similarity in predection score:", sim)

# Show the rows where the character noise flipped the prediction.
combined_data.loc[combined_data['Changed_text_labels'].ne(combined_data['text_labels'])]

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

similarity in predection score: 0.6338582677165354


Unnamed: 0,text,labels,Changed_text,Changed_text_labels,text_labels
1,"Just thinking about it, if there is a psycho, ...",0,"J5usit thinRkiMnDg about it, if there is a Gps...",0,1
2,You can see the boy in the shape of a boy.. Pa...,1,You can see the boy in the psha+pe of a boy. ....,1,0
3,"third polygamists, those who eat manlike indiv...",2,"thilrkd polygamists, those who eat manql&ikje ...",0,2
4,"{Singal tee machi u tube} Like Chanel, can you...",0,{ SgingHal tee Fmauchi u tu0bpe} BLikce ChaMne...,2,0
6,J. devi jahir J. devi jahir Non all men kill. ...,2,J. devi ja!hPir J. devi jahir Non all men kill...,0,2
...,...,...,...,...,...
1759,Can you send these drooling dog to Jaffna? We ...,2,Can you s7en@d #theWse &droolTinqg dog to !Jaf...,0,2
1760,Correct answer to the question he asks,0,7Corcrerct aCnsDwer to the question he aNskzs,2,0
1767,Transgenders have a good heart...these women d...,0,Transgenders h)avSe a IgooQd heart. .. Hthe%se...,1,0
1772,"Aravanis embody societal ruiner, ..",2,"Arava#ntifs embody socWielta2l ruinmejr, ..",1,2


In [42]:
# Character-noised texts and their gold labels.
test = combined_data['Changed_text'].tolist()
labels = combined_data['labels'].tolist()

# Reuse the predictions for the noised text already stored on the frame when
# they exist, rather than running another full inference pass over the same texts.
if 'Changed_text_labels' in combined_data.columns:
    predictions = combined_data['Changed_text_labels'].tolist()
else:
    predictions, _ = model.predict(test)

# One record per text whose predicted class differs from the gold label.
misclassified_records = [
    {'text': text, 'true_label': true_label, 'predicted_label': predicted_label}
    for text, true_label, predicted_label in zip(test, labels, predictions)
    if predicted_label != true_label
]

# Explicit columns keep the frame well-formed even when nothing is misclassified.
mis = pd.DataFrame(misclassified_records, columns=['text', 'true_label', 'predicted_label'])
mis

  0%|          | 0/1778 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"thilrkd polygamists, those who eat manql&ikje ...",2,0
1,{ SgingHal tee Fmauchi u tu0bpe} BLikce ChaMne...,0,2
2,J. devi ja!hPir J. devi jahir Non all men kill...,2,0
3,Chandra Le1kVha,2,1
4,Pjriyadh)aErKshni K Bodi Venna ok go have sex ...,1,0
...,...,...,...
672,There is a gay lesbian combination in e%v+ery ...,0,1
673,Transgenders h)avSe a IgooQd heart. .. Hthe%se...,0,1
674,"Arava#ntifs embody socWielta2l ruinmejr, ..",2,1
675,She acts very nice. DeoTxyaYdenlosYin!e monoph...,2,0


In [43]:
from sklearn.metrics import classification_report

# Class ids used by this notebook: 0 = ally, 1 = homophobic, 2 = transphobic.
target_names = ['ally', 'homophobic', 'transphobic']

# `labels=[0, 1, 2]` pins each display name to its class id regardless of which
# classes appear in the data. `zero_division=0` reports 0.0 without a warning
# for a class that is never predicted.
print(classification_report(
    labels,
    predictions,
    labels=[0, 1, 2],
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

        ally       0.55      0.74      0.63       599
  homophobic       0.66      0.64      0.65       598
 transphobic       0.70      0.48      0.57       581

    accuracy                           0.62      1778
   macro avg       0.64      0.62      0.62      1778
weighted avg       0.64      0.62      0.62      1778

