In [None]:
!pip install transformers
!pip install simpletransformers

English Data

In [None]:
import pandas as pd
df=pd.read_csv('/content/drive/MyDrive/Sentiment-analysis/IMDB Dataset.csv') 

In [None]:
print(df['sentiment'].value_counts())

positive    25000
negative    25000
Name: sentiment, dtype: int64


In [None]:
df['sentiment'] = df['sentiment'].replace('negative', 0)
df['sentiment'] = df['sentiment'].replace('positive', 1)

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit


X = df['review']
y = df['sentiment']

testsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)

train_val_idx, test_idx = next(testsplit.split(X, y))

X_train_val, X_test = X.iloc[train_val_idx], X.iloc[test_idx]
y_train_val, y_test = y.iloc[train_val_idx], y.iloc[test_idx]


validsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

train_idx, val_idx = next(validsplit.split(X_train_val, y_train_val))

X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]

train_df = pd.DataFrame({'text': X_train, 'category': y_train})
val_df = pd.DataFrame({'text': X_val, 'category': y_val})
test_df = pd.DataFrame({'text': X_test, 'category': y_test})

print(f'Training set shape: {train_df.shape}')
print(f'Validation set shape: {val_df.shape}')
print(f'Test set shape: {test_df.shape}')


Training set shape: (30000, 2)
Validation set shape: (10000, 2)
Test set shape: (10000, 2)


### Mixing english and tamil translated to english data

In [None]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [None]:
print(train_df['category'].value_counts())
print(val_df['category'].value_counts())
print(test_df['category'].value_counts())

1    15000
0    15000
Name: category, dtype: int64
1    5000
0    5000
Name: category, dtype: int64
0    5000
1    5000
Name: category, dtype: int64


In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args ={"reprocess_input_data": True,
             "overwrite_output_dir": True,
             "fp16":False,
             "num_train_epochs": 3,
             "max_seq_length": 128,
             "train_batch_size": 32,
             "eval_batch_size": 32,
             "logging_steps": 50,
             "save_steps": 2000,
             "learning_rate": 3e-6,
             "manual_seed": 4}


model = ClassificationModel(
    "bert", "bert-base-multilingual-cased",
    num_labels=2,
    args=train_args
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Train the model
model.train_model(train_df)



  0%|          | 0/30000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/938 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/938 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/938 [00:00<?, ?it/s]

(2814, 0.3842183563543196)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='macro')
result, model_outputs, wrong_predictions = model.eval_model(val_df, f1=f1_multiclass, acc=accuracy_score)
result



  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/313 [00:00<?, ?it/s]

{'mcc': 0.6968616370371868,
 'tp': 4310,
 'tn': 4173,
 'fp': 827,
 'fn': 690,
 'auroc': 0.92918624,
 'auprc': 0.9275383194464738,
 'f1': 0.8482715220819796,
 'acc': 0.8483,
 'eval_loss': 0.3532210279006166}

In [None]:
test_df

Unnamed: 0,text,category
18870,"Yes, MTV there really is a way to market Daria...",0
39791,The story of the bride fair is an amusing and ...,0
30381,"A team varied between Scully and Mulder, two o...",1
42294,This was a popular movie probably because of t...,0
33480,This movie made me so angry!! Here I am thinki...,0
...,...,...
3634,SILVER CITY (2+ outta 5 stars) As a huge fan o...,0
47910,Moscow Zero stole my money and I want it back!...,0
16086,This is the only film I've seen that is made b...,0
48294,"This is a story about Shin-ae, who moves to Mi...",1


In [None]:
test = test_df['text'].tolist()
labels = test_df['category'].tolist()
predictions, _ = model.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,The story of the bride fair is an amusing and ...,0,1
1,This was a popular movie probably because of t...,0,1
2,"I will stat of with the plot Alice, having sur...",0,1
3,There is a uk edition to this show which is ra...,0,1
4,Interferencia starts as unemployed Martin Sand...,0,1
...,...,...,...
1462,This movie plays out like an English version o...,0,1
1463,"Okay, first of all I got this movie as a Chris...",1,0
1464,My main comment on this movie is how Zwick was...,0,1
1465,This is another Bollywood remake of a Hollywoo...,0,1


In [None]:
from sklearn.metrics import classification_report
target_names = ['negitive', 'positive']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

    negitive       0.86      0.84      0.85      5000
    positive       0.84      0.87      0.86      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [None]:
!pip install torch

In [None]:
import torch
#torch.save(model, '/content/drive/MyDrive/Sentiment-analysis/sentiment_analysis-imdb-mbert')

In [None]:
import torch
m = torch.load('/content/drive/MyDrive/Sentiment-analysis/sentiment_analysis-imdb-mbert')

In [None]:
m

<simpletransformers.classification.classification_model.ClassificationModel at 0x7f83b55351e0>

## Robustness Analysis

In [None]:
!pip install nlpaug

Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


##Synonymn aug

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import random

data = test_df.copy()
syn_aug = naw.SynonymAug(aug_src='wordnet')
lis = []
for index, row in data.iterrows():
    text = syn_aug.augment(row['text'], n=1)
    lis.extend(list(zip(text)))
tf = pd.DataFrame(lis, columns=['Changed_text'])

data = data.reset_index(drop=True)
tf = tf.reset_index(drop=True)

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,category,Changed_text
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria..."
1,The story of the bride fair is an amusing and ...,0,The story of the bride fair be an amusing and ...
2,"A team varied between Scully and Mulder, two o...",1,"A team varied between Scully and Mulder, two o..."
3,This was a popular movie probably because of t...,0,This was a popular movie probably because of t...
4,This movie made me so angry!! Here I am thinki...,0,This movie made me so angry! ! Here I am think...
...,...,...,...
9995,SILVER CITY (2+ outta 5 stars) As a huge fan o...,0,SILVER CITY (2 + outta 5 stars) As a huge fan ...
9996,Moscow Zero stole my money and I want it back!...,0,Moscow Zero stole my money and I want it back!...
9997,This is the only film I've seen that is made b...,0,This is the only film I ' ve seen that is made...
9998,"This is a story about Shin-ae, who moves to Mi...",1,"This is a story about Shin - ae, who moves to ..."


In [None]:
test = combined_data['Changed_text'].tolist()
org = combined_data['text'].tolist()
labels = combined_data['category'].tolist()

combined_data['Changed_text_labels'], _ = m.predict(test)
combined_data['text_labels'], _ = m.predict(org)


from sklearn.metrics import accuracy_score
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])

print("similarity in predection score:",sim)
combined_data[combined_data['Changed_text_labels'] != combined_data['text_labels']]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

similarity in predection score: 0.965


Unnamed: 0,text,category,Changed_text,Changed_text_labels,text_labels
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria...",1,0
6,"I will stat of with the plot Alice, having sur...",0,"I will stat of with the plot Alice, having sur...",0,1
11,I went to this film having no idea what to exp...,1,I went to this film having no idea what to exp...,0,1
22,"This was not a very good movie, the acting pre...",0,"This was not a very good movie, the acting pre...",0,1
41,Norm(an)ally I don't mind remakes. There are s...,0,Norm (an) ally I don ' t creative thinker rema...,0,1
...,...,...,...,...,...
9936,This is one of the most guilty pleasure movies...,1,This is one of the most shamefaced pleasure mo...,1,0
9952,"Anyone who does not find this movie funny, doe...",1,"Anyone who does non find this picture funny, d...",0,1
9953,"This is a strange, little, forgotten movie fro...",1,"This be a foreign, little, disregarded flick f...",0,1
9972,Jon Stewart (aka John Liebowitz) constantly ri...,0,Jon Stewart (aka John Liebowitz) constantly ri...,0,1


In [None]:
test = combined_data['Changed_text'].tolist()
labels = combined_data['category'].tolist()
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})
mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Yes, MTV there really is a way to market Daria...",0,1
1,The story of the bride fair be an amusing and ...,0,1
2,This was a popular movie probably because of t...,0,1
3,Thither is a uk edition to this show which is ...,0,1
4,I went to this film having no idea what to exp...,1,0
...,...,...,...
1532,"Okay, first of all Iodine got this movie as a ...",1,0
1533,My main comment on this movie is how Zwick was...,0,1
1534,This is another Bollywood remake of a Hollywoo...,0,1
1535,"Only saw this show a few times, but will exist...",1,0


In [None]:
from sklearn.metrics import classification_report
target_names = ['neg','pos']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

         neg       0.84      0.85      0.85      5000
         pos       0.85      0.84      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



## RandomCharAug

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import random

data = test_df.copy()
char_aug = nac.RandomCharAug(action="insert")
lis = []
for index, row in data.iterrows():
    text = char_aug.augment(row['text'], n=1)
    lis.extend(list(zip(text)))
tf = pd.DataFrame(lis, columns=['Changed_text'])

data = data.reset_index(drop=True)
tf = tf.reset_index(drop=True)

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,category,Changed_text
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria..."
1,The story of the bride fair is an amusing and ...,0,The story of the bCr&ide fair is an amusing an...
2,"A team varied between Scully and Mulder, two o...",1,"A team varied ^betwebesn Scully and Mulder, tw..."
3,This was a popular movie probably because of t...,0,This was a popular movie probably because of t...
4,This movie made me so angry!! Here I am thinki...,0,This movie made me so angry! ! HeIree I am thi...
...,...,...,...
9995,SILVER CITY (2+ outta 5 stars) As a huge fan o...,0,SILVER CITY (2 + outta 5 stars) As a huge fan ...
9996,Moscow Zero stole my money and I want it back!...,0,Moscow Zero stole my money and I want it back!...
9997,This is the only film I've seen that is made b...,0,This is the only film I ' ve seen that is made...
9998,"This is a story about Shin-ae, who moves to Mi...",1,"This is a story about Shin - ae, who moves to ..."


In [None]:
test = combined_data['Changed_text'].tolist()
org = combined_data['text'].tolist()
labels = combined_data['category'].tolist()

combined_data['Changed_text_labels'], _ = m.predict(test)
combined_data['text_labels'], _ = m.predict(org)


from sklearn.metrics import accuracy_score
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])

print("similarity in predection score:",sim)
combined_data[combined_data['Changed_text_labels'] != combined_data['text_labels']]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

similarity in predection score: 0.9559


Unnamed: 0,text,category,Changed_text,Changed_text_labels,text_labels
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria...",1,0
6,"I will stat of with the plot Alice, having sur...",0,"I will stat of with the plot Alice, having the...",0,1
8,There is a uk edition to this show which is ra...,0,There is a uk edition to this show which rathe...,0,1
22,"This was not a very good movie, the acting pre...",0,"This was a very good, the acting pretty much s...",0,1
41,Norm(an)ally I don't mind remakes. There are s...,0,Norm (an) ally I don ' t mind remakes. There a...,0,1
...,...,...,...,...,...
9972,Jon Stewart (aka John Liebowitz) constantly ri...,0,Jon Stewart (aka John Liebowitz) constantly ri...,0,1
9975,This movie just might make you cooooo. The fil...,1,Movie just make cooooo. The film was WELL wort...,1,0
9977,This movie plays out like an English version o...,0,This plays out like an English version of an A...,0,1
9992,"Only saw this show a few times, but will live ...",1,"Only saw this show a few times, but will live ...",1,0


In [None]:
test = combined_data['Changed_text'].tolist()
labels = combined_data['category'].tolist()
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})
mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Yes, MTV there really is a way to market Daria...",0,1
1,The story of bride fair is amusing and engagin...,0,1
2,This was a popular movie probably because of t...,0,1
3,Interferencia starts as unemployed Martin Sand...,0,1
4,Little seems to have mastered art of having th...,0,1
...,...,...,...
1491,I must say that I was disapointed with this fi...,0,1
1492,"Okay, first of I got this movie a Christmas pr...",1,0
1493,My main comment on this movie is how Zwick was...,0,1
1494,This is another Bollywood remake of a Hollywoo...,0,1


In [None]:
from sklearn.metrics import classification_report
target_names = ['neg', 'pos']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      5000
         pos       0.85      0.86      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



## Sentence suffiling

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
#import nlpaug.augmenter.sentence as nas
import random
import nltk


# Create an instance of the augmenter class
data = test_df.copy()
tokenizer = nltk.tokenize.sent_tokenize
sen_aug = naw.RandomWordAug(action="swap", tokenizer=tokenizer)
lis = []
for index, row in data.iterrows():
    text = sen_aug.augment(row['text'], n=1)
    lis.extend(list(zip(text)))
tf = pd.DataFrame(lis, columns=['Changed_text'])

data = data.reset_index(drop=True)
tf = tf.reset_index(drop=True)

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,category,Changed_text
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria..."
1,The story of the bride fair is an amusing and ...,0,"It is weird, though, to find an independent mo..."
2,"A team varied between Scully and Mulder, two o...",1,"A team varied between Scully and Mulder, two o..."
3,This was a popular movie probably because of t...,0,this was a popular movie probably because of t...
4,This movie made me so angry!! Here I am thinki...,0,Here I am thinking that here's a new horror mo...
...,...,...,...
9995,SILVER CITY (2+ outta 5 stars) As a huge fan o...,0,"That said, the movie isn't exactly *terrible*...."
9996,Moscow Zero stole my money and I want it back!...,0,"This is a horror movie, not thriller, not susp..."
9997,This is the only film I've seen that is made b...,0,But I still wanted to watch it because I'm a h...
9998,"This is a story about Shin-ae, who moves to Mi...",1,"This is a story about Shin-ae, who moves to Mi..."


In [None]:
test = combined_data['Changed_text'].tolist()
org = combined_data['text'].tolist()
labels = combined_data['category'].tolist()

combined_data['Changed_text_labels'], _ = m.predict(test)
combined_data['text_labels'], _ = m.predict(org)


from sklearn.metrics import accuracy_score
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])

print("similarity in predection score:",sim)
combined_data[combined_data['Changed_text_labels'] != combined_data['text_labels']]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

similarity in predection score: 0.9426


Unnamed: 0,text,category,Changed_text,Changed_text_labels,text_labels
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria...",1,0
3,This was a popular movie probably because of t...,0,this was a popular movie probably because of t...,0,1
6,"I will stat of with the plot Alice, having sur...",0,"I will stat of with the plot Alice, having sur...",0,1
18,The film listed here as having been made in 19...,0,The film listed here as having been made in 19...,1,0
22,"This was not a very good movie, the acting pre...",0,But there were some funny moments but most of ...,0,1
...,...,...,...,...,...
9879,Much more than ANY other film from that period...,1,It's undemanding fun with loads of nasty make-...,1,0
9913,I watched the entire movie recognizing the par...,0,I watched the entire movie recognizing the par...,1,0
9974,WHITE FIRE was recommended to me by a guy who ...,1,I generally don't watch movies knowing that th...,0,1
9992,"Only saw this show a few times, but will live ...",1,"Only saw this show a few times, but will live ...",1,0


In [None]:
test = combined_data['Changed_text'].tolist()
labels = combined_data['category'].tolist()
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})
mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Yes, MTV there really is a way to market Daria...",0,1
1,"It is weird, though, to find an independent mo...",0,1
2,There is a uk edition to this show which is ra...,0,1
3,Interferencia starts as unemployed Martin Sand...,0,1
4,Little Quentin seems to have mastered the art ...,0,1
...,...,...,...
1494,This movie plays out like an English version o...,0,1
1495,FIRST - This movie was meant to be in stereosc...,1,0
1496,My main comment on this movie is how Zwick was...,0,1
1497,This is another Bollywood remake of a Hollywoo...,0,1


In [None]:
from sklearn.metrics import classification_report
target_names = ['neg','pos']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      5000
         pos       0.84      0.86      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



naw.RandomWordAug(action='delete')

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import random

data = test_df.copy()
w_aug = naw.RandomWordAug(action='delete')
lis = []
for index, row in data.iterrows():
    text = w_aug.augment(row['text'], n=1)
    lis.extend(list(zip(text)))
tf = pd.DataFrame(lis, columns=['Changed_text'])

data = data.reset_index(drop=True)
tf = tf.reset_index(drop=True)

combined_data = pd.concat([data, tf], axis=1)
combined_data

Unnamed: 0,text,category,Changed_text
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria..."
1,The story of the bride fair is an amusing and ...,0,The story of bride fair is amusing and engagin...
2,"A team varied between Scully and Mulder, two o...",1,"A team varied between Scully and Mulder, two o..."
3,This was a popular movie probably because of t...,0,This was a popular movie probably because of t...
4,This movie made me so angry!! Here I am thinki...,0,Movie made me so! ! Here I am thinking that he...
...,...,...,...
9995,SILVER CITY (2+ outta 5 stars) As a huge fan o...,0,SILVER CITY (2 + outta 5 stars) As a huge fan ...
9996,Moscow Zero stole my money and I want it back!...,0,Moscow Zero stole my money and I want it back!...
9997,This is the only film I've seen that is made b...,0,This is the only film I ' ve seen that is made...
9998,"This is a story about Shin-ae, who moves to Mi...",1,"This is a story about Shin -, who moves to Mil..."


In [38]:
test = combined_data['Changed_text'].tolist()
org = combined_data['text'].tolist()
labels = combined_data['category'].tolist()

combined_data['Changed_text_labels'], _ = m.predict(test)
combined_data['text_labels'], _ = m.predict(org)


from sklearn.metrics import accuracy_score
sim = accuracy_score(combined_data['Changed_text_labels'], combined_data['text_labels'])

print("similarity in predection score:",sim)
combined_data[combined_data['Changed_text_labels'] != combined_data['text_labels']]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

similarity in predection score: 0.9426


Unnamed: 0,text,category,Changed_text,Changed_text_labels,text_labels
0,"Yes, MTV there really is a way to market Daria...",0,"Yes, MTV there really is a way to market Daria...",1,0
3,This was a popular movie probably because of t...,0,this was a popular movie probably because of t...,0,1
6,"I will stat of with the plot Alice, having sur...",0,"I will stat of with the plot Alice, having sur...",0,1
18,The film listed here as having been made in 19...,0,The film listed here as having been made in 19...,1,0
22,"This was not a very good movie, the acting pre...",0,But there were some funny moments but most of ...,0,1
...,...,...,...,...,...
9879,Much more than ANY other film from that period...,1,It's undemanding fun with loads of nasty make-...,1,0
9913,I watched the entire movie recognizing the par...,0,I watched the entire movie recognizing the par...,1,0
9974,WHITE FIRE was recommended to me by a guy who ...,1,I generally don't watch movies knowing that th...,0,1
9992,"Only saw this show a few times, but will live ...",1,"Only saw this show a few times, but will live ...",1,0


In [39]:
test = combined_data['Changed_text'].tolist()
labels = combined_data['category'].tolist()
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})
mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"Yes, MTV there really is a way to market Daria...",0,1
1,"It is weird, though, to find an independent mo...",0,1
2,There is a uk edition to this show which is ra...,0,1
3,Interferencia starts as unemployed Martin Sand...,0,1
4,Little Quentin seems to have mastered the art ...,0,1
...,...,...,...
1494,This movie plays out like an English version o...,0,1
1495,FIRST - This movie was meant to be in stereosc...,1,0
1496,My main comment on this movie is how Zwick was...,0,1
1497,This is another Bollywood remake of a Hollywoo...,0,1


In [40]:
from sklearn.metrics import classification_report
target_names = ['neg', 'pos']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      5000
         pos       0.84      0.86      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [41]:
!pip install emoji
import emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234911 sha256=406e12cc14993aef36cffc4963f0c259b4438b99589a84a68305a6bfd089038b
  Stored in directory: /root/.cache/pip/wheels/02/3d/88/51a592b9ad17e7899126563698b4e3961983ebe85747228ba6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0


testing on lgbt dataset

In [45]:
import pandas as pd
e_test=pd.read_csv('/content/eng_3_test.tsv', delimiter='\t')
e_test = e_test.rename(columns={'text                        ': 'text'})
#e_test=e_test[["text", "category"]] 
e_test = e_test.dropna()
test_df = e_test
test_df['category'] = test_df['category'].replace('Non-anti-LGBT+ content', 'ally')
test_df['category'] = test_df['category'].replace('ally', 1)
test_df['category'] = test_df['category'].replace('Homophobic', 0)
test_df['category'] = test_df['category'].replace('Transphobic', 0)
test_df=test_df[["text", "category"]]
test_df = test_df.rename(columns={'text': 'text', 'category': 'labels'})
test_df['text'] = test_df['text'].apply(lambda x: emoji.demojize(x, delimiters=(" ", " ")))

In [43]:
test = test_df['text'].tolist()
labels = test_df['labels'].tolist()
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,"For those who speak about culture., pre marita...",1,0
1,99% tamilans cant understand r accept it becau...,1,0
2,Really this prank will be more powerful Feedba...,1,0
3,She has a point,1,0
4,Please waiting for part 2 paa ...,1,0
...,...,...,...
379,She is innocent nd cute,1,0
380,Worst thirunangai idiot che go way ...,1,0
381,Hats off handshake .... Semma semma finally t...,1,0
382,Looks like Karthik took advantage on Varun whe...,1,0


In [44]:
from sklearn.metrics import classification_report
target_names = ['0', '1']
print(classification_report(labels, predictions, target_names=target_names))

              precision    recall  f1-score   support

           0       0.11      0.70      0.19        66
           1       0.97      0.61      0.74       924

    accuracy                           0.61       990
   macro avg       0.54      0.65      0.47       990
weighted avg       0.91      0.61      0.71       990



In [None]:
#Multi lingual analysis

In [None]:
sentences = [
    "I hate spending time with my family.",
    "Je suis heureux de passer du temps avec mes amis.",
    "Me encanta la playa y el sol.",
    "Ich fühle mich entspannt und glücklich im Wald.",
    "Sono felice quando sto ballando.",
    "I'm so sad that my dog passed away.",
    "Je suis déçu de ne pas avoir obtenu le poste.",
    "No me gusta la lluvia y el frío.",
    "Ich habe schlechte Laune, weil ich den Bus verpasst habe.",
    "Mi sento triste quando penso a ciò che è successo."
]
labels = [0, 1, 1, 1, 1, 0, 0, 0, 0, 0]



In [None]:
test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,I'm so sad that my dog passed away.,0,1
1,Mi sento triste quando penso a ciò che è succe...,0,1


In [None]:
predictions

array([0, 1, 1, 1, 1, 1, 0, 0, 0, 1])

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install checklist

In [None]:
dataset = ['This was a very nice movie directed by John Smith.',
           'Mary Keen was brilliant.',
          'I hated everything about this.',
          'This movie was very bad.',
          'I really liked this movie.',
          'just bad.',
          'amazing.',
          ]
nlp = spacy.load('en_core_web_sm')
pdataset = list(nlp.pipe(dataset))
ret = Perturb.perturb(pdataset, Perturb.add_negation)
ret.data

[['This was a very nice movie directed by John Smith.',
  'This was not a very nice movie directed by John Smith.'],
 ['Mary Keen was brilliant.', 'Mary Keen was not brilliant.'],
 ['I hated everything about this.', "I didn't hate everything about this."],
 ['This movie was very bad.', 'This movie was not very bad.'],
 ['I really liked this movie.', "I really didn't like this movie."]]

In [None]:


sentences = [
    "मैं अपने परिवार के साथ बिताने का समय प्यार करता हूँ।",
    "मुझे अपने दोस्तों के साथ घूमना अच्छा लगता है।",
    "आज का दिन मेरे लिए बहुत बुरा था।"
]
labels = [1, 1, 0]

test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,आज का दिन मेरे लिए बहुत बुरा था।,0,1


In [None]:
"I love spending time with my family."
"I enjoy hanging out with my friends."
"Today was a very bad day for me."

'Today was a very bad day for me.'

In [None]:


sentences = [
    "నేను నా కుటుంబంతో సమయం కళ్ళుకోలుకునేవాడు.",
    "నాకు నా స్నేహితులతో సమయం కళ్ళుకోలుకునేవాడు.",
    "నాకు కన్నీటి మరియు చలితం నచ్చదు."
]
labels = [1, 1, 0]


test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
"I am someone who spends time with my family."
"I am someone who spends time with my friends."
"I don't like noise and movement."

"I don't like noise and movement."

In [None]:
sentences = [
    "எனக்கு என் குடும்பத்துடன் நேரம் காட்ட நல்லது.",
    "எனக்கு என் நண்பர்களுடன் வேலை செய்யும் நேரம் நல்லது.",
    "இன்று எனக்கு பெரிய தீங்கு உள்ளது."
]
labels = [1, 1, 0]

test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,இன்று எனக்கு பெரிய தீங்கு உள்ளது.,0,1


In [None]:
"It's good for me to spend time with my family."
"It's good for me to work with my friends."
"Today I feel very unlucky."

'Today I feel very unlucky.'

In [None]:
sentences = [
    "എനിക്ക് എന്റെ കുടുംബത്തോട് സമയം കാണാന്‍ നല്ലതാണ്.",
    "എനിക്ക് എന്റെ സൗഹൃദങ്ങളോട് ജോലി ചെയ്യാന്‍ നല്ലതാണ്.",
    "ഇന്ന് എനിക്ക് വളരെ മോശമായ തെറ്റ് ഉണ്ടായിരിക്കുന്നു."
]
labels = [1, 1, 0]

test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,എനിക്ക് എന്റെ കുടുംബത്തോട് സമയം കാണാന്‍ നല്ലതാണ്.,1,0
1,എനിക്ക് എന്റെ സൗഹൃദങ്ങളോട് ജോലി ചെയ്യാന്‍ നല്ല...,1,0


In [None]:
"It's good for me to spend time with my family."
"It's good for me to work with my friends."
"Today I'm making a lot of mistakes."

"Today I'm making a lot of mistakes."

In [None]:
#Robust ness

In [None]:
sentences = [
    "I didn't not enjoy the movie.",
    "I can't believe how terrible that meal was.",
    "It's not like I hate the product, but it's just not for me.",
    "The customer service was so bad, I'm never going back to that store.",
    "I have mixed feelings about this restaurant; the food was great, but the service was terrible.",
    "I'm not sure if I liked the book or not; it was well-written but the story was depressing.",
    "I'm really happy with the product, but the delivery was delayed by a week.",
    "I had a great time at the party, but the music was way too loud."
]

labels = [0, 0, 0, 0, 0, 1, 1, 1]

test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,I'm not sure if I liked the book or not; it wa...,1,0
1,"I'm really happy with the product, but the del...",1,0
2,"I had a great time at the party, but the music...",1,0


In [None]:
sentences = [
    "I didint not enjoy the muvie.",
    "I cant belive how terrable that meel was.",
    "Its not like I hat the product, but it just not for me.",
    "The costumer servise was so bad, Im never going back to that stoor.",
    "I have mixxed feelings abot this restront; the food was great, but the servise was terible.",
    "Im not sure if I liked the boook or not; it was wel-writen but the story was depresing.",
    "Im really happy with the product, but the delivry was delaied by a weak.",
    "I had a great time at the pary, but the music was way too loud."
]
labels = [0, 1, 0, 0, 1, 0, 1, 1]

test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,I have mixxed feelings abot this restront; the...,1,0
1,"Im really happy with the product, but the deli...",1,0
2,"I had a great time at the pary, but the music ...",1,0


In [None]:
sentences = [
    "I didn&#x27;t not enjoy the movie.",
    "I can&#x27;t believe how <strong>terrible</strong> that meal was.",
    "It&#x27;s not like I <emph>hate</emph> the product, but it&#x27;s just not for me.",
    "The <u>customer service</u> was so bad, I&#x27;m never going back to that store.",
    "I have mixed feelings about this <a href='#'>restaurant</a>; the food was great, but the service was terrible.",
    "I&#x27;m not sure if I liked the <i>book</i> or not; it was well-written but the story was depressing.",
    "I&#x27;m really <span style='color: green'>happy</span> with the product, but the delivery was delayed by a week.",
    "I had a great time at the <b>party</b>, but the music was way too loud."
]


labels = [0, 0, 0, 0, 0, 1, 1, 1]


test = sentences
labels = labels
predictions, _ = m.predict(test)

misclassified_records = []

for i in range(len(test)):
    if predictions[i] != labels[i]:
        misclassified_records.append({'text': test[i], 'true_label': labels[i], 'predicted_label': predictions[i]})

mis = pd.DataFrame(misclassified_records)
mis

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,true_label,predicted_label
0,I&#x27;m not sure if I liked the <i>book</i> o...,1,0
1,I&#x27;m really <span style='color: green'>hap...,1,0
2,"I had a great time at the <b>party</b>, but th...",1,0
