In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm 
from sklearn.model_selection import train_test_split

import torch
import spacy
from torch import nn, optim
from transformers import pipeline
from functools import partial

nlp = spacy.load("en_core_web_trf")



In [2]:
imdb_data=pd.read_csv('./data/processed_train.csv')
print(imdb_data.shape)
imdb_data.head(5)

(29341, 4)


Unnamed: 0,ID,review,sentiment,processed_review
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...


In [3]:
imdb_test_data=pd.read_csv('./data/processed_test.csv')
print(imdb_test_data.shape)
imdb_test_data.head(5)

(29341, 3)


Unnamed: 0,ID,review,processed_review
0,22622,Robert Lansing plays a scientist experimenting...,robert lansing play scientist experiment pass ...
1,10162,"Well I've enjoy this movie, even though someti...",ve enjoy movie turn stereotypical situation nt...
2,17468,First things first - though I believe Joel Sch...,thing believe joel schumacher well mediocre di...
3,42579,I watched this movie on the grounds that Amber...,watch movie ground amber benson rock nick stah...
4,701,A certain sexiness underlines even the dullest...,certain sexiness underline dull tangent


In [4]:
MAX_LEN = 512

In [5]:
all_review = list(map(str, np.concatenate([imdb_data.processed_review.values, imdb_test_data.processed_review.values])))
train_review = list(map(str, imdb_data.processed_review.values))
test_review = list(map(str, imdb_test_data.processed_review.values))

#### produce column : pruned_review

if input documents are longer than 512 tokens, sentiment analysis pipeline is not out of box. so, we need to prune reviews first. 

Our preprocessing pipeline : tokenize -> remove stopwords -> choose words where pos like adj, adv, propn etc., these words are representative.


In [6]:
def prune_text(text, MAX_LEN):
    doc = nlp(text)
    lst = []
    for token in doc:
        # print(token, token.pos_)
        if token.pos_ in ('ADJ','ADV','CONJ','CCONJ','PART','PROPN'):
            lst.append(token.orth_)
    if len(lst) > MAX_LEN:
        lst = lst[:MAX_LEN]

    return " ".join(lst)

# processed_text = list(map(partial(prune_text, MAX_LEN=MAX_LEN), list(train_review)))
# imdb_data['pruned_review'] = processed_text
# imdb_data.to_csv('./data/processed_train.csv',index=0)

# processed_text = list(map(partial(prune_text, MAX_LEN=MAX_LEN), list(test_review)))
# imdb_test_data['pruned_review'] = processed_text
# imdb_data.to_csv('./data/processed_test.csv',index=0)

In [7]:
# https://towardsdatascience.com/sentiment-analysis-with-pretrained-transformers-using-pytorch-420bbc1a48cd
sentiment_analysis = pipeline("sentiment-analysis")

In [8]:
classifier = pipeline('sentiment-analysis')
results = classifier(["We are very happy to show you the 🤗 Transformers library.",
           "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


In [9]:
s = "I'm extremely excited but today weather is not good."
print(s)
sentiment_analysis(s)[0]

I'm extremely excited but today weather is not good.


{'label': 'NEGATIVE', 'score': 0.9990981817245483}

In [10]:
s1 = prune_text(s, MAX_LEN=MAX_LEN)
print(s1)
sentiment_analysis(s1)[0]

extremely excited but not good


{'label': 'NEGATIVE', 'score': 0.9929606914520264}

In [11]:
imdb_data['processed_review'] = imdb_data['processed_review'].astype(str)

In [12]:
pruned = []
for txt in tqdm(imdb_data['processed_review']):
    pruned.append(prune_text(txt, MAX_LEN=MAX_LEN))
    

100%|██████████| 29341/29341 [1:08:54<00:00,  7.10it/s]


In [13]:
imdb_data['pruned_review'] = pruned

In [14]:
imdb_data

Unnamed: 0,ID,review,sentiment,processed_review,pruned_review
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...,river phoenix joaquin phoenix embarrasing weir...
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...,usually great crazy pretty mr jones different ...
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre,bad mediocre
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...,ago single good close wonderfuly original usua...
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...,wrong wrong myers ago cutesy painfully unfunny...
...,...,...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1,s honest film hollywood,honest hollywood
29337,453,An absorbing and unsettling psychological drama .,1,absorb unsettling psychological drama,absorb unsettling psychological
29338,13097,"Soylent Green IS...a really good movie, actual...",1,soylent green good movie actually ve think don...,soylent green good actually heston sneery invi...
29339,26896,There just isn't enough here. There a few funn...,0,isn funny spot disappoint love stupid movie ex...,isn funny stupid good loud maybe funny flat ma...


In [15]:
imdb_data.to_csv('./data/pruned_train.csv', index=0)

In [17]:
imdb_data['pruned_review'] = imdb_data['pruned_review'].astype(str)

In [26]:
result = []
score = []
for txt in tqdm(imdb_data['pruned_review']):
    result.append('0' if sentiment_analysis(txt)[0]['label'] == 'NEGATIVE' else '1')
    score.append(sentiment_analysis(txt)[0]['score'])

100%|██████████| 29341/29341 [25:15<00:00, 19.36it/s]


In [27]:
imdb_data['pred_sentiment'] = result
imdb_data['score'] = score

Unnamed: 0,ID,review,sentiment,processed_review,pruned_review,pred_sentiment,score
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...,river phoenix joaquin phoenix embarrasing weir...,0,0.984501
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...,usually great crazy pretty mr jones different ...,1,0.999685
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre,bad mediocre,0,0.999798
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...,ago single good close wonderfuly original usua...,1,0.999860
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...,wrong wrong myers ago cutesy painfully unfunny...,0,0.973535
...,...,...,...,...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1,s honest film hollywood,honest hollywood,1,0.999783
29337,453,An absorbing and unsettling psychological drama .,1,absorb unsettling psychological drama,absorb unsettling psychological,0,0.854891
29338,13097,"Soylent Green IS...a really good movie, actual...",1,soylent green good movie actually ve think don...,soylent green good actually heston sneery invi...,1,0.997482
29339,26896,There just isn't enough here. There a few funn...,0,isn funny spot disappoint love stupid movie ex...,isn funny stupid good loud maybe funny flat ma...,0,0.997078


In [39]:
# filtered weird sentiment data
imdb_data = imdb_data[~imdb_data['sentiment'].str.contains('[A-Za-z]')]
imdb_data

Unnamed: 0,ID,review,sentiment,processed_review,pruned_review,pred_sentiment,score
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...,river phoenix joaquin phoenix embarrasing weir...,0,0.984501
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...,usually great crazy pretty mr jones different ...,1,0.999685
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre,bad mediocre,0,0.999798
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...,ago single good close wonderfuly original usua...,1,0.999860
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...,wrong wrong myers ago cutesy painfully unfunny...,0,0.973535
...,...,...,...,...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1,s honest film hollywood,honest hollywood,1,0.999783
29337,453,An absorbing and unsettling psychological drama .,1,absorb unsettling psychological drama,absorb unsettling psychological,0,0.854891
29338,13097,"Soylent Green IS...a really good movie, actual...",1,soylent green good movie actually ve think don...,soylent green good actually heston sneery invi...,1,0.997482
29339,26896,There just isn't enough here. There a few funn...,0,isn funny spot disappoint love stupid movie ex...,isn funny stupid good loud maybe funny flat ma...,0,0.997078


In [40]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(imdb_data['sentiment'], imdb_data['pred_sentiment']))

0.7703726453726454


In [42]:
print(classification_report(imdb_data['sentiment'], imdb_data['pred_sentiment']))


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -)       0.00      0.00      0.00         1
           0       0.80      0.71      0.75     14368
           1       0.75      0.83      0.79     14935

    accuracy                           0.77     29304
   macro avg       0.52      0.51      0.51     29304
weighted avg       0.77      0.77      0.77     29304



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
imdb_data.to_csv('data/pruned_review.csv', index=0)

In [45]:
%%time

imdb_test_data['processed_review'] = imdb_test_data['processed_review'].astype(str)

pruned = []
for txt in tqdm(imdb_test_data['processed_review']):
    pruned.append(prune_text(txt, MAX_LEN=MAX_LEN))
    

100%|██████████| 29341/29341 [22:42:45<00:00,  2.79s/it]

CPU times: user 6h 7min 18s, sys: 37.3 s, total: 6h 7min 56s
Wall time: 22h 42min 45s





In [46]:
imdb_test_data['pruned_review'] = pruned

In [47]:
%%time

imdb_test_data['pruned_review'] = imdb_test_data['pruned_review'].astype(str)

result = []
score = []
for txt in tqdm(imdb_test_data['pruned_review']):
    result.append('0' if sentiment_analysis(txt)[0]['label'] == 'NEGATIVE' else '1')
    score.append(sentiment_analysis(txt)[0]['score'])

100%|██████████| 29341/29341 [36:42<00:00, 13.32it/s]

CPU times: user 2h 24min 23s, sys: 22.9 s, total: 2h 24min 46s
Wall time: 36min 42s





In [48]:
imdb_test_data['pred_sentiment'] = result
imdb_test_data['score'] = score
imdb_test_data

Unnamed: 0,ID,review,processed_review,pruned_review,pred_sentiment,score
0,22622,Robert Lansing plays a scientist experimenting...,robert lansing play scientist experiment pass ...,robert lansing solid far unintentionally dimen...,1,0.996732
1,10162,"Well I've enjoy this movie, even though someti...",ve enjoy movie turn stereotypical situation nt...,stereotypical nt nt different finally maybe in...,0,0.983873
2,17468,First things first - though I believe Joel Sch...,thing believe joel schumacher well mediocre di...,joel schumacher mediocre downright bad ugly jo...,0,0.996279
3,42579,I watched this movie on the grounds that Amber...,watch movie ground amber benson rock nick stah...,amber nick stahl generally pretty cool bad rea...,0,0.997871
4,701,A certain sexiness underlines even the dullest...,certain sexiness underline dull tangent,certain dull,0,0.999785
...,...,...,...,...,...,...
29336,30370,It is difficult to rate a writer/director's fi...,difficult rate writer director s effort movie ...,difficult s direct james uneven straight commo...,0,0.528997
29337,18654,"After watching this movie once, it quickly bec...",watch movie quickly favorite different event h...,quickly favorite different,1,0.997787
29338,47985,"Even though i sat and watched the whole thing,...",sit watch thing good place big chunk informati...,good big inaccurate michael nt current dramati...,1,0.959790
29339,9866,Warning Spoilers following. Superb recreation ...,warn spoiler follow superb recreation base ant...,superb antarctica real libelous scandalous can...,0,0.879290


In [49]:
imdb_test_data.to_csv('pruned_test.csv', index=0)

In [51]:
submission = imdb_test_data[['ID', 'pred_sentiment']]
submission.rename({'pred_sentiment':'sentiment'}, axis=1,inplace=True)
submission.to_csv('./data/baseline_submission.csv', index=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [52]:
submission

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,0
2,17468,0
3,42579,0
4,701,0
...,...,...
29336,30370,0
29337,18654,1
29338,47985,1
29339,9866,0
