In [145]:
# Loading dataset

import pandas as pd
path = ('singlesinfernos1e1.xlsx')
xl = pd.ExcelFile(path)
df1 = xl.parse('e1')
display(df1)

Unnamed: 0,Time,Subtitle,Speaker,Translation,Interactions
0,12s,(dahui ),,Here are two islands.,
1,,(다희),,,
2,18s,(dahui ),,"On these islands, you focus only on each other...",
3,,(다희),,,
4,22s,(dahui ),,under limited conditions,
...,...,...,...,...,...
2011,,(다희) 그러면 한 분은 0표를 받았다는 거죠,,,
2012,01:00:23,(gyuhyeon ) geuleohjyo,,Yes.,
2013,,(규현) 그렇죠,,,
2014,01:00:59,jamag : baehaneul,,,


In [146]:
#clear all rows that have NaN as either speaker or interactions
df1.dropna(subset=['Speaker', 'Interactions'], how='all',inplace=True)
df1.drop('Subtitle', inplace=True, axis=1)

In [147]:
# removing stopwords using nltk

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stop = stopwords.words('english')

df1['Translation'] = df1['Translation'].apply(lambda x: ' '.join([word for word in x.replace('-','').replace('?','').replace('.','').replace(',','').split() if word not in (stop)]))
df1['Translation'] = df1['Translation'].astype(str).str.lower()
df1['Translation'] = df1['Translation'].apply(lambda x: ' '.join([item for item in x.split() if len(item)>3]))

# lemmatize - words into root form
nltk.download('wordnet')
wordnet_lem = WordNetLemmatizer()
df1['Translation'] = df1['Translation'].apply(wordnet_lem.lemmatize)

[nltk_data] Downloading package stopwords to C:\Users\Yue
[nltk_data]     Ning\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Yue
[nltk_data]     Ning\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [148]:
# Personalities & sentiment

In [153]:
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from wordcloud import WordCloud

# need to make interactive functionality to choose the person you want


x,y = np.ogrid[:300, :300]
mask = (x-150) ** 2 + (y-150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

copy = df1
cast = ['Hyeonjung','Se-hoon','Jun-sik','Si-hun','Jin-taek', 'Ji-yeon','So-yeon','Yea-won','Ji-a']
c = Dropdown(options=cast)

@interact
def choose_person(person=c):
    df2 = copy['Translation'].where(copy['Speaker'] == person)
    df2.dropna(inplace=True)
    all_words_person = ''.join([word for word in df2])

    wordcloud = WordCloud(background_color="white", repeat=True, mask=mask).generate(all_words_person)
    plt.figure(figsize=(10,7))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')

# bar graph to show the sentiments of each person

interactive(children=(Dropdown(description='person', options=('Hyeonjung', 'Se-hoon', 'Jun-sik', 'Si-hun', 'Ji…

In [150]:
# Looking at interactions - match two people; how do they feel about each other? 
# And which two people have the most positive/ negative feelings

df3 = df1.copy(deep=True)
df3[['PersonOne', 'PersonTwo']] = df1['Interactions'].str.split(', ', expand=True)
df3.dropna(subset=['Interactions'], how='all',inplace=True)
print(df3)

                     Time  Speaker                   Translation  \
892   1900-01-01 05:43:00      NaN             seem approachable   
894   1900-01-01 05:46:00      NaN         looks different smile   
898   1900-01-01 05:50:00      NaN  when there's expression face   
900   1900-01-01 05:53:00      NaN         idea thinking feeling   
906   1900-01-01 06:06:00  se-hoon               smile beautiful   
...                   ...      ...                           ...   
1530  1900-01-01 22:17:00      NaN                     glad like   
1532  1900-01-01 22:19:00      NaN              it's really good   
1534  1900-01-01 22:20:00      NaN        like it's really tasty   
1598  1900-01-02 00:37:00      NaN           i'll talk know like   
1600  1900-01-02 00:40:00      NaN             barely talk today   

          Interactions PersonOne PersonTwo  
892      Se-hoon, Ji-a   Se-hoon      Ji-a  
894      Jun-sik, Ji-a   Jun-sik      Ji-a  
898      Se-hoon, Ji-a   Se-hoon      Ji-a  
900

In [151]:
from ipywidgets import interact, Dropdown
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
maleList = ['Hyeonjung','Se-hoon','Jun-sik','Si-hun','Jin-taek']
femaleList = ['Ji-yeon','So-yeon','Yea-won','Ji-a']

df4 = df3.copy(deep=True)

df5 = df4
df5.dropna(subset=['Translation'])

def calculate_polarity(df5):

    # change data structure
    df5['Polarity'] = df4['Translation'].apply(lambda x: analyser.polarity_scores(x))
    df5 = pd.concat([df5.drop(['Polarity'], axis=1), df5['Polarity'].apply(pd.Series)], axis=1)

    # create sentiment
    df5['Sentiment'] = df5['compound'].apply(lambda x: 'Positive' if x > 0 else 'Neutral' if x==0 else 'Negative')
    return df5

df5 = calculate_polarity(df5)

# calculate sentiment for every pair
pairs = set()
for m in maleList:
    for f in femaleList:
        pairs.add((m,f))

sentiment = {}
for p1,p2 in pairs:
    temp_df = df5.loc[(df5['PersonOne'] == p1)]
    temp_df = temp_df.loc[(temp_df['PersonTwo'] == p2)]
    sentiment[(p1,p2)] = temp_df['compound'].sum()

print(sentiment)

{('Jun-sik', 'Yea-won'): 1.5177, ('Jin-taek', 'Yea-won'): 0.0, ('Jun-sik', 'Ji-a'): 0.3612, ('Jin-taek', 'Ji-a'): 0.0, ('Se-hoon', 'Ji-yeon'): 4.911700000000001, ('Hyeonjung', 'Ji-yeon'): 0.0, ('Jun-sik', 'So-yeon'): 1.4448, ('Jin-taek', 'So-yeon'): 0.7269, ('Se-hoon', 'Yea-won'): 0.0, ('Si-hun', 'Ji-yeon'): 0.0, ('Hyeonjung', 'Yea-won'): 0.0, ('Se-hoon', 'Ji-a'): 0.4892, ('Si-hun', 'Yea-won'): 0.0, ('Hyeonjung', 'Ji-a'): 3.9137, ('Si-hun', 'Ji-a'): 0.0, ('Se-hoon', 'So-yeon'): 0.0, ('Hyeonjung', 'So-yeon'): 0.0, ('Si-hun', 'So-yeon'): 0.0, ('Jun-sik', 'Ji-yeon'): 0.0, ('Jin-taek', 'Ji-yeon'): 0.0}


[nltk_data] Downloading package vader_lexicon to C:\Users\Yue
[nltk_data]     Ning\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [152]:
import matplotlib.pyplot as plt
m = Dropdown(options=maleList)
f = Dropdown(options=femaleList)
@interact(male=m)
def generate_graph(male):
    y_axis = []
    for f in femaleList:
        y_axis.append(sentiment[(male,f)])
    plt.bar(femaleList,y_axis)
    plt.title('Sentiment Male and Female')
    plt.xlabel('Gals')
    plt.ylabel('Sentiment')
    plt.show()


interactive(children=(Dropdown(description='male', options=('Hyeonjung', 'Se-hoon', 'Jun-sik', 'Si-hun', 'Jin-…