## Data Loading

In [1]:
import pandas as pd

In [2]:
movie_char = pd.read_csv('movie100.csv')
review = pd.read_csv('Movie_Reviews_100.csv')

In [3]:
movie_char.head()

Unnamed: 0.1,Unnamed: 0,id,title,year,length,genre,rating
0,0,tt0111161,The Shawshank Redemption,1994,2h 22min,Drama,9.3
1,1,tt0068646,The Godfather,1972,2h 55min,Crime,9.2
2,2,tt0071562,The Godfather: Part II,1974,3h 22min,Crime,9.0
3,3,tt0468569,The Dark Knight,2008,2h 32min,Action,9.0
4,4,tt0050083,12 Angry Men,1957,1h 36min,Crime,8.9


In [4]:
review.head()

Unnamed: 0,Data of Review,Rating (Out of 10),Review,id
0,2020-07-21 00:00:00,10,"A number of giant, career-making performances ...",tt0172495
1,2020-07-21 00:00:00,10,This is one of the best films seen in my life....,tt0172495
2,2020-07-13 00:00:00,9,With the unbeliveable acting and production de...,tt0172495
3,2014-04-27 00:00:00,10,You have to watch this movie at least 3 times ...,tt0172495
4,2002-03-29 00:00:00,7,There is much to appreciate about about this p...,tt0172495


## Emotion Analysis

NRCLexicon is an MIT-approved pypi project by Mark M. Bailey which predicts the sentiments and emotion of a given text. The package contains approximately 27,000 words and is based on the National Research Council Canada (NRC) affect lexicon and the NLTK library’s WordNet synonym sets.

Emotional affects measured include the following:
- fear
- anger
- anticipation
- trust
- surprise
- positive
- negative
- sadness
- disgust
- joy

More information: https://www.geeksforgeeks.org/emotion-classification-using-nrc-lexicon-in-python/

In [5]:
from nrclex import NRCLex
review['nrc'] = review['Review'].apply(lambda x: NRCLex(x))

In [6]:
#extract frequency of emotions from reviews
review['fear'] = review['nrc'].apply(lambda x: x.affect_frequencies['fear'])
review['anger'] = review['nrc'].apply(lambda x: x.affect_frequencies['anger'])
review['trust'] = review['nrc'].apply(lambda x: x.affect_frequencies['trust'])
review['surprise'] = review['nrc'].apply(lambda x: x.affect_frequencies['surprise'])
review['positive'] = review['nrc'].apply(lambda x: x.affect_frequencies['positive'])
review['negative'] = review['nrc'].apply(lambda x: x.affect_frequencies['negative'])
review['sadness'] = review['nrc'].apply(lambda x: x.affect_frequencies['sadness'])
review['disgust'] = review['nrc'].apply(lambda x: x.affect_frequencies['disgust'])
review['joy'] = review['nrc'].apply(lambda x: x.affect_frequencies['joy'])
review['anticipation'] = review['nrc'].apply(lambda x: list(x.affect_frequencies.values())[-1])

In [7]:
#extract top emotions from reviews
review['top emotions'] = review['nrc'].apply(lambda x: x.top_emotions)

In [8]:
review['top emotions'] = review['top emotions'].apply(lambda x: [e[0] for e in x])

In [10]:
review['top emotions']

0                          [positive]
1         [fear, trust, anticipation]
2                          [positive]
3                      [anticipation]
4                          [positive]
                     ...             
127644                     [positive]
127645                     [positive]
127646                 [anticipation]
127647                     [positive]
127648                     [positive]
Name: top emotions, Length: 127649, dtype: object

In [14]:
#clean reviews

##tokenization and remove puncuation
review['cleaned review'] = review['Review'].str.replace(r'([^\w\s]+)|(\n)+', ' ').str.lower().str.split()

##remove stopwords
from nltk.corpus import stopwords
review['cleaned review'] = review['cleaned review'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])


## Visualization of Emotions by Movie Genre - Worldcloud

In [88]:
import stylecloud

In [145]:
def wc(file_name, text):
    name= file_name + '.txt'
    
    with open(name, 'w') as filehandle:
        filehandle.writelines("%s\n" % " ".join(word) for word in text)

    stylecloud.gen_stylecloud(
                          file_path=name,
                          icon_name = "fas fa-video",
                          size =1028,
                          palette='tableau.Gray_5',
                          background_color ='white',output_name = file_name +'.png'
)

In [19]:
#merge two datasets to get genre
review2 = review.merge(movie_char[['id','genre']]) 

In [98]:
#further data cleaning for wordcloud
#Part of Speech techinique
import spacy
nlp = spacy.load('en')

def pos2(text): #function that only preserves noun in the review
    newtxt =[]
    doc = nlp(text)
    for t in doc:
        if t.pos_ =='NOUN': 
            newtxt.append(t.lemma_)
    return newtxt

In [130]:
def remove_movie(x): # function that remove "stop words"
    if 'film' in x:
        x = list(filter(('film').__ne__, x))
    if 'movie' in x:
        x = list(filter(('movie').__ne__, x))
    if 'people' in x:
        x = list(filter(('people').__ne__, x))
    if 'thing' in x:
        x = list(filter(('thing').__ne__, x))
    if 'time' in x:
        x = list(filter(('time').__ne__, x))
    if 'year' in x:
        x = list(filter(('year').__ne__, x))
    if 'story' in x:
        x = list(filter(('story').__ne__, x))
    if 'character' in x:
        x = list(filter(('character').__ne__, x)) 
    if 'part' in x:
        x = list(filter(('part').__ne__, x))
    if 'plot' in x:
        x = list(filter(('plot').__ne__, x))
    if 'way' in x:
        x = list(filter(('way').__ne__, x))
    if 'scene' in x:
        x = list(filter(('scene').__ne__, x))
    if 'performance' in x:
        x = list(filter(('performance').__ne__, x))
    if 'actor' in x:
        x = list(filter(('actor').__ne__, x))
    return x

## Drama

In [56]:
drama = review2.loc[review2['genre']=='Drama',['cleaned review','top emotions']]
drama 

Unnamed: 0,cleaned review,top emotions
13853,"['seldomly', 'rate', 'movies', 'one', 'star', ...",['positive']
13854,"['beloved', 'film', 'many', 'afraid', 'thing',...","['trust', 'positive']"
13855,"['people', 'say', 'like', 'eve', 'end', 'borin...","['positive', 'negative']"
13856,"['doesen', 'get', 'better', 'eve', 'incredibly...",['positive']
13857,"['ever', 'film', 'destined', 'critics', 'grove...",['positive']
...,...,...
121484,"['7', 'shining', 'horror', '1980', 'jack', 'ja...",['positive']
121485,"['combination', 'legendary', 'stanley', 'kubri...",['positive']
121486,"['generally', 'like', 'horror', 'movies', 'one...",['positive']
121487,"['felt', 'comment', 'film', 'think', 'great', ...",['positive']


In [101]:
%%time
#further clean review
drama['noun'] =  drama['cleaned review'].apply(lambda x: pos2(x))

CPU times: user 16min 27s, sys: 1min 23s, total: 17min 50s
Wall time: 17min 52s


In [131]:
#further clean review
drama['noun2'] = drama['noun'].apply(remove_movie)

In [132]:
#extract emotions
drama_positive= drama.loc[drama['top emotions'].apply(lambda x: 'positive' in x),'noun2']
drama_negative= drama.loc[drama['top emotions'].apply(lambda x: 'negative' in x),'noun2']
drama_anticipation= drama.loc[drama['top emotions'].apply(lambda x: 'anticipation' in x),'noun2']
drama_trust= drama.loc[drama['top emotions'].apply(lambda x: 'trust' in x),'noun2']
drama_fear= drama.loc[drama['top emotions'].apply(lambda x: 'fear' in x),'noun2']
drama_joy= drama.loc[drama['top emotions'].apply(lambda x: 'joy' in x),'noun2']
drama_sadness= drama.loc[drama['top emotions'].apply(lambda x: 'sadness' in x),'noun2']
drama_surprise= drama.loc[drama['top emotions'].apply(lambda x: 'surprise' in x),'noun2']
drama_anger= drama.loc[drama['top emotions'].apply(lambda x: 'anger' in x),'noun2']
drama_disgust= drama.loc[drama['top emotions'].apply(lambda x: 'disgust' in x),'noun2']

In [133]:
#create wordcloud images
wc('drama_positive', drama_positive)
wc('drama_negative', drama_negative)
wc('drama_anticipation', drama_anticipation)
wc('drama_trust', drama_trust)
wc('drama_fear', drama_fear)
wc('drama_joy', drama_joy)
wc('drama_sadness', drama_sadness)
wc('drama_surprise', drama_surprise)
wc('drama_anger', drama_anger)
wc('drama_disgust', drama_disgust)

In [55]:
drama['top emotions'].str.strip('[]').str.replace(',',"").str.split().explode().value_counts()

'positive'        23355
'negative'         7365
'anticipation'     4369
'trust'            3654
'fear'             3172
'joy'              2826
'sadness'          1879
'surprise'         1754
'anger'            1640
'disgust'          1309
'anticip'           981
Name: top emotions, dtype: int64

In [34]:
#perform similar process for the other 2 genres
action = review2.loc[review2['genre']=='Action',['cleaned review','top emotions']]
crime = review2.loc[review2['genre']=='Crime',['cleaned review','top emotions']]
comedy = review2.loc[review2['genre']=='Comedy',['cleaned review','top emotions']]

## Action

In [62]:
action['top emotions'].str.strip('[]').str.replace(',',"").str.split().explode().value_counts()

'positive'        33364
'negative'         8194
'anticipation'     6734
'trust'            4786
'joy'              3956
'fear'             3247
'surprise'         2564
'sadness'          2558
'anger'            2424
'disgust'          1950
'anticip'          1471
Name: top emotions, dtype: int64

In [105]:
%%time
action['noun'] =  action['cleaned review'].apply(lambda x: pos2(x))

CPU times: user 22min 5s, sys: 2min, total: 24min 5s
Wall time: 24min 7s


In [134]:
action['noun2'] = action['noun'].apply(remove_movie)

In [135]:
action_positive= action.loc[action['top emotions'].apply(lambda x: 'positive' in x),'noun2']
action_negative= action.loc[action['top emotions'].apply(lambda x: 'negative' in x),'noun2']
action_anticipation= action.loc[action['top emotions'].apply(lambda x: 'anticipation' in x),'noun2']
action_trust= action.loc[action['top emotions'].apply(lambda x: 'trust' in x),'noun2']
action_joy= action.loc[action['top emotions'].apply(lambda x: 'joy' in x),'noun2']
action_fear= action.loc[action['top emotions'].apply(lambda x: 'fear' in x),'noun2']
action_sadness= action.loc[action['top emotions'].apply(lambda x: 'sadness' in x),'noun2']
action_surprise= action.loc[action['top emotions'].apply(lambda x: 'surprise' in x),'noun2']
action_anger= action.loc[action['top emotions'].apply(lambda x: 'anger' in x),'noun2']
action_disgust= action.loc[action['top emotions'].apply(lambda x: 'disgust' in x),'noun2']

In [136]:
wc('action_positive', action_positive)
wc('action_negative', action_negative)
wc('action_anticipation', action_anticipation)
wc('action_trust', action_trust)
wc('action_fear', action_fear)
wc('action_joy', action_joy)
wc('action_sadness', action_sadness)
wc('action_surprise', action_surprise)
wc('action_anger', action_anger)
wc('action_disgust', action_disgust)

## Crime

In [64]:
crime['top emotions'].str.strip('[]').str.replace(',',"").str.split().explode().value_counts()

'positive'        20467
'negative'         7065
'anticipation'     4382
'trust'            3655
'joy'              3186
'fear'             3008
'sadness'          2263
'surprise'         2011
'anger'            1906
'disgust'          1532
'anticip'          1075
Name: top emotions, dtype: int64

In [113]:
%%time
crime['noun'] =  crime['cleaned review'].apply(lambda x: pos2(x))


CPU times: user 12min 51s, sys: 1min 6s, total: 13min 58s
Wall time: 13min 59s


In [137]:
crime['noun2'] = crime['noun'].apply(remove_movie)

In [138]:
crime_positive= crime.loc[crime['top emotions'].apply(lambda x: 'positive' in x),'noun2']
crime_negative= crime.loc[crime['top emotions'].apply(lambda x: 'negative' in x),'noun2']
crime_anticipation= crime.loc[crime['top emotions'].apply(lambda x: 'anticipation' in x),'noun2']
crime_trust= crime.loc[crime['top emotions'].apply(lambda x: 'trust' in x),'noun2']
crime_joy= crime.loc[crime['top emotions'].apply(lambda x: 'joy' in x),'noun2']
crime_fear= crime.loc[crime['top emotions'].apply(lambda x: 'fear' in x),'noun2']
crime_sadness= crime.loc[crime['top emotions'].apply(lambda x: 'sadness' in x),'noun2']
crime_surprise= crime.loc[crime['top emotions'].apply(lambda x: 'surprise' in x),'noun2']
crime_anger= crime.loc[crime['top emotions'].apply(lambda x: 'anger' in x),'noun2']
crime_disgust= crime.loc[crime['top emotions'].apply(lambda x: 'disgust' in x),'noun2']

In [139]:
wc('crime_positive', crime_positive)
wc('crime_negative', crime_negative)
wc('crime_anticipation', crime_anticipation)
wc('crime_trust', crime_trust)
wc('crime_fear', crime_fear)
wc('crime_joy', crime_joy)
wc('crime_sadness', crime_sadness)
wc('crime_surprise', crime_surprise)
wc('crime_anger', crime_anger)
wc('crime_disgust', crime_disgust)