In [1]:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
#nltk.download ('stopwords')
#nltk.download ('punkt')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Lexical Sentiment Analysis of Mr. Robot Subtitles

## Introduction

## Data Import & Cleaning

Load files, remove unneccessary information and prepare for analysis.

In [2]:
import pandas as pd
import os

# directory specifications & plot settings
subtitle_dir = "subtitles"
seasons = ["S01", "S02", "S03", "S04"]
colors = ['#CA3A3C', '#F47B3D', '#61A783', '#41A2AE']

# store imported file content
columns = ('season', 'episode', 'title', 'subtitles')
subtitle_df = pd.DataFrame(columns=columns)


def clean_subtitles(subtitles):
    """
    Cleans list of strings imported from .srt file.
    
    Removes:
    - numbering
    - timestamps
    - empty strings
    - contractions
    - upper case letters
    
    Returns: List of remaining strings.
    """
    # remove numbering, empty strings & timestamps
    subtitles = [l for l in subtitles if (not l.isdigit() and l != '' and '-->' not in l)]
    
    # replace contractions
    subtitles = [l.replace("'m", " am")
                  .replace("can't", "cannot") 
                  .replace("n't", " not")
                  .replace("'ve", " have")
                  .replace("'re", " are")
                  .replace("'ll", " will")
                  .replace("'d", " would")
                  .replace("'s", "") # could be "is" or posessive
                 for l in subtitles]
    
    # convert everything to lowercase
    subtitles = [l.lower() for l in subtitles]

    return subtitles

# loop through all season folders
for s in seasons:
    directory = os.getcwd() + "/subtitles" + "/" + s
    
    # loop through files if directory exists
    try:
        for f in os.listdir(directory):
            if f.endswith(".srt"):
                file_path = os.path.join(directory, f)
                file = open(file_path, mode='r', encoding='utf-8-sig')
                file_content = file.read().splitlines()
                

                subtitles = clean_subtitles(file_content)
                
                subtitle_df = subtitle_df.append({'season': int(s[-2:]), 
                                                  'episode': int(f[4:6]),
                                                  'title': f[7:-4],
                                                  'subtitles': ' '.join(subtitles)}, ignore_index=True)

    # directory does not exist
    except FileNotFoundError:
        print("Directory " + subtitle_dir + "/" + s + " not found.")

# order df by season and episode
subtitle_df = subtitle_df.sort_values(by=['season', 'episode'])
subtitle_df = subtitle_df.reset_index(drop=True)

subtitle_df.head()

Unnamed: 0,season,episode,title,subtitles
0,1,1,eps1.0_hellofriend.mov,"hello, friend. ""hello, friend."" that lame. may..."
1,1,2,eps1.1_ones-and-zer0es.mpeg,"what i am about to tell you is top secret, evi..."
2,1,3,eps1.2_d3bug.mkv,i will be the youngest executive this company ...
3,1,4,eps1.3_da3m0ns.mp4,steel mountain. the hacker dilemma. what to do...
4,1,5,eps1.4_3xpl0its.wmv,he wants to hack steel mountain climate contro...


Get additional metadata for episodes from IMDB

https://www.omdbapi.com/

In [3]:
import requests

# url parameters
base_url = 'http://www.omdbapi.com/?'
key = '95422fcb'


def create_url(base=base_url, key=key, **kwargs):
    
    # add key
    url = "{}apikey={}".format(base, key)
    
    # add other provided parameters
    for key, val in kwargs.items():
        if (key == 'title'):
            url += "&{}".format("t={}".format("+".join(val.split(" "))))
        elif (key == 'season'):
             url += "&{}".format("Season={}".format(val))
        elif (key == 'episode'):
            url += "&{}".format("Episode={}".format(val))
            
    return url


def get_imdb_data(title, season, episode):
    
    url = create_url(title=title, season=season, episode=episode)
    
    r = requests.get(url)
    data = r.json()
    
    print(data)
    
    return {'season': int(data['Season']),
            'episode': int(data['Episode']),
            'rating': data['imdbRating'],
            'runtime': data['Runtime'][:2], #'runtime': int(data['Runtime'][:2]),
           }


# get additional IMDB data for each episode that has a subtitle file
columns = ['season', 'episode', 'rating', 'runtime', 'release_date', 'director']
imdb_df = pd.DataFrame(columns=columns)

for i, row in subtitle_df.iterrows():
    imdb_df = imdb_df.append(get_imdb_data(title="Mr. Robot", season=row['season'], episode=row['episode']), 
                             ignore_index=True) 

# merge IMDB and subtitle data
df = pd.merge(subtitle_df, imdb_df, on=['season', 'episode'], how='left')
df.head()

{'Title': 'eps1.0_hellofriend.mov', 'Year': '2015', 'Rated': 'TV-14', 'Released': '24 Jun 2015', 'Season': '1', 'Episode': '1', 'Runtime': '62 min', 'Genre': 'Crime, Drama, Thriller', 'Director': 'Niels Arden Oplev', 'Writer': 'Sam Esmail (created by), Sam Esmail', 'Actors': 'Rami Malek, Carly Chaikin, Portia Doubleday, Martin Wallström', 'Plot': 'A notorious hacker takes an interest in cyber security engineer and vigilante styled computer hacker Elliot, while an evil corporation is hacked.', 'Language': 'English, Spanish', 'Country': 'USA', 'Awards': 'N/A', 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY4MDgwMzEwMl5BMl5BanBnXkFtZTgwMzgzMzk5NTE@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '9.3/10'}], 'Metascore': 'N/A', 'imdbRating': '9.3', 'imdbVotes': '11822', 'imdbID': 'tt4652838', 'seriesID': 'tt4158110', 'Type': 'episode', 'Response': 'True'}
{'Title': 'eps1.1_ones-and-zer0es.mpeg', 'Year': '2015', 'Rated': 'TV-14', 'Released': '01 Jul 2015', 

{'Title': 'eps2.0_unm4sk-pt1.tc', 'Year': '2016', 'Rated': 'TV-MA', 'Released': '10 Jul 2016', 'Season': '2', 'Episode': '1', 'Runtime': '41 min', 'Genre': 'Crime, Drama, Thriller', 'Director': 'Sam Esmail', 'Writer': 'Sam Esmail (created by), Sam Esmail, Adam Penn (story editor), Randolph Leon (story editor)', 'Actors': 'Rami Malek, Portia Doubleday, Carly Chaikin, Michael Cristofer', 'Plot': "Elliot keeps seeing his psychologist, Krista. The new character Susan Jacobs is having trouble with technology. Gideon pays a visit to Elliot complaining he's been targeted for a crime ...", 'Language': 'English, Spanish', 'Country': 'USA', 'Awards': 'N/A', 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTg2OTQ3NTUwNV5BMl5BanBnXkFtZTgwNzYyMzk0OTE@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'}], 'Metascore': 'N/A', 'imdbRating': '8.3', 'imdbVotes': '5107', 'imdbID': 'tt4901088', 'seriesID': 'tt4158110', 'Type': 'episode', 'Response': 'True'}
{'Title': 'e

{'Title': 'eps2.8_h1dden-pr0cess.axx', 'Year': '2016', 'Rated': 'TV-MA', 'Released': '07 Sep 2016', 'Season': '2', 'Episode': '10', 'Runtime': '43 min', 'Genre': 'Crime, Drama, Thriller', 'Director': 'Sam Esmail', 'Writer': 'Sam Esmail (created by), Kor Adana, Randolph Leon, Adam Penn (story editor), Randolph Leon (story editor)', 'Actors': 'Rami Malek, Carly Chaikin, Portia Doubleday, Michael Cristofer', 'Plot': 'Elliot wonders if Mr. Robot has been lying to him; Darlene attempts to do the right thing; Dom and the FBI get closer.', 'Language': 'English, Spanish', 'Country': 'USA', 'Awards': 'N/A', 'Poster': 'https://m.media-amazon.com/images/M/MV5BZTM2ZmUxOTYtZTI0YS00YTJlLWIyZWEtOGUzODhlNjJiZGRhXkEyXkFqcGdeQXVyNTg1MjA0OTA@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '9.3/10'}], 'Metascore': 'N/A', 'imdbRating': '9.3', 'imdbVotes': '4312', 'imdbID': 'tt5347330', 'seriesID': 'tt4158110', 'Type': 'episode', 'Response': 'True'}
{'Title': 'eps2.9_pyth0n-pt1.p

{'Title': 'eps3.7_dont-delete-me.ko', 'Year': '2017', 'Rated': 'TV-MA', 'Released': '29 Nov 2017', 'Season': '3', 'Episode': '8', 'Runtime': '47 min', 'Genre': 'Crime, Drama, Thriller', 'Director': 'Sam Esmail', 'Writer': 'Sam Esmail (created by), Sam Esmail, Adam Penn (executive story editor), Randolph Leon (executive story editor), Courtney Looney (story editor)', 'Actors': 'Rami Malek, Carly Chaikin, Portia Doubleday, Michael Cristofer', 'Plot': 'Elliot tries to get ghosted; it is the day of all days.', 'Language': 'N/A', 'Country': 'N/A', 'Awards': 'N/A', 'Poster': 'https://m.media-amazon.com/images/M/MV5BMGI5YTZjOWYtODAyNy00MTA0LWI2M2EtNTMwMjU4ZGMxMGFlXkEyXkFqcGdeQXVyODE5ODkzMzg@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '9.2/10'}], 'Metascore': 'N/A', 'imdbRating': '9.2', 'imdbVotes': '3645', 'imdbID': 'tt6635212', 'seriesID': 'tt4158110', 'Type': 'episode', 'Response': 'True'}
{'Title': 'eps3.8_stage3.torrent', 'Year': '2017', 'Rated': 'TV-MA', '

{'Title': '408 Request Timeout', 'Year': '2019', 'Rated': 'TV-MA', 'Released': '24 Nov 2019', 'Season': '4', 'Episode': '8', 'Runtime': '44 min', 'Genre': 'Crime, Drama, Thriller', 'Director': 'Sam Esmail', 'Writer': 'Sam Esmail (created by), Ted Kupper (staff writer), Jeff McKibben (Staff Writer), Robbie Pickering', 'Actors': 'Rami Malek, Carly Chaikin, Martin Wallström, Grace Gummer', 'Plot': "Janice tries to get Elliot's location from Dom and Darlene. Elliot goes to the Queens Museum.", 'Language': 'N/A', 'Country': 'N/A', 'Awards': 'N/A', 'Poster': 'https://m.media-amazon.com/images/M/MV5BODBkMzNhOWUtZTRhNi00ZWIyLWI0NTAtYzhhMTE2MzYyZGQ1XkEyXkFqcGdeQXVyODgzNzA5MA@@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '9.4/10'}], 'Metascore': 'N/A', 'imdbRating': '9.4', 'imdbVotes': '2352', 'imdbID': 'tt8084178', 'seriesID': 'tt4158110', 'Type': 'episode', 'Response': 'True'}
{'Title': '409 Conflict', 'Year': '2019', 'Rated': 'TV-MA', 'Released': '01 Dec 2019', 

Unnamed: 0,season,episode,title,subtitles,rating,runtime,release_date,director
0,1,1,eps1.0_hellofriend.mov,"hello, friend. ""hello, friend."" that lame. may...",9.3,62,,
1,1,2,eps1.1_ones-and-zer0es.mpeg,"what i am about to tell you is top secret, evi...",8.7,48,,
2,1,3,eps1.2_d3bug.mkv,i will be the youngest executive this company ...,8.3,46,,
3,1,4,eps1.3_da3m0ns.mp4,steel mountain. the hacker dilemma. what to do...,8.1,46,,
4,1,5,eps1.4_3xpl0its.wmv,he wants to hack steel mountain climate contro...,8.6,45,,


In [None]:
# check rows with missing values
imdb_df.head(50)

## Text Exploration & Analytics


- Episodes With Highest Word Count
- Episode Word Count Distribution
- Word count vs. episode length?
- Most frequently used words per episode/season

### Open Ideas:
- Word cloud with longest words?

- Lexical div./dens. vs. rating (maybe vs. episode length?)

Identify undesirable words that should be removed.

In [None]:
import re

# tokenize subtitles into words
df['word_tokens'] = df['subtitles'].apply(word_tokenize)

# remove special characters
p = re.compile('[^a-zA-Z\d\s:]')
df['word_tokens_clean'] = df['word_tokens'].apply(lambda x: ["".join(l for l in w if not p.match(w)) for w in x])
df['word_tokens_clean'] = df['word_tokens_clean'].apply(lambda x: [w for w in x if w])

## remove stop words from word tokens
stop_words = set(stopwords.words("english"))
df['word_tokens_clean'] = df['word_tokens_clean'].apply(lambda x: [w for w in x if w not in stop_words])

In [None]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# number of words per episode
fig, ax1 = plt.subplots(1, 1, figsize=(15, 10))

episodes = df.index
seasons = df['season']
word_counts = df['word_tokens_clean'].str.len()
color_array = [
    colors[0] if s == 1 
    else colors[1] if s == 2 
    else colors[2] if s == 3
    else colors[3] if s == 4
    else '' for s in seasons]

# word count
ax1.bar(episodes, word_counts, color=color_array, alpha=0.9)
ax1.set_xlim([-1, len(episodes)])
ax1.set_xlabel('Episode')
ax1.set_xticks(episodes)
ax1.set_xticklabels(episodes)
ax1.set_ylabel('Word Count')

# runtime
ax2 = ax1.twinx()
ax2.plot(episodes, df['runtime'], color='k', marker='o')
ax2.set_ylabel('Runtime / min')

# add legend
legend_elements = [
    Patch(color=colors[0], label='Season 1'),
    Patch(color=colors[1], label='Season 2'),
    Patch(color=colors[2], label='Season 3'),
    Patch(color=colors[3], label='Season 4'),
    Line2D([0], [0], color='k', marker='o', label='Runtime')]
ax2.legend(handles=legend_elements)

plt.show()

Describe Wordcount/Runtime graph

Explain how to count & visualize most common words in each season

In [None]:
from collections import Counter

# plot frequency distributions for cleaned word lists for each season
fig, axs = plt.subplots(2, 2, figsize=(15,10))
axs = [ax for sublist in axs for ax in sublist]

# aggregate cleaned words for all seasons
season_word_tokens_clean = df.groupby('season')['word_tokens_clean'].sum()

for i, row in enumerate(season_word_tokens_clean):
    counts = Counter(row).most_common(10)

    axs[i].barh([x[0] for x in counts], [x[1] for x in counts], color=colors[i])
    axs[i].set_title('Season ' + str((i + 1)))
    axs[i].set_xlabel('Word Count')
    axs[i].invert_yaxis()

plt.show()

tokenize of NLTK splits contractions like gonna and wanna -> gon na fragments


list of uninteresting words -> TF-IDF??!! (allows to keep stop words), Stemming?!

In [None]:
# implement TF-IDF
##
##
##
##

Explain results after TD-IDF/stemming

Details of final step of lexical analysis -> Formulas for diversity and density

In [None]:
# number of unique words used per episode
def lexical_diversity(words):
    return len(set(words))
df['lexical_diversity'] = df['word_tokens_clean'].apply(lexical_diversity)

# diversity in relation to number of used words
def lexical_density(words):
    return(lexical_diversity(words) / len(words))
df['lexical_density'] = df['word_tokens_clean'].apply(lexical_density)

# plot lexical analysis
def create_segments(x, y):
    """Split line into segments between two points to allow different line colors."""
    points = np.array([x, y]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)
    
    return segments

fig, ax1 = plt.subplots(1, 1, figsize=(15,10))

episodes = list(df.index)
seasons = df['season']
color_array = [
    colors[0] if s == 1 
    else colors[1] if s == 2 
    else colors[2] if s == 3
    else colors[3] if s == 4
    else '' for s in seasons]

# draw line between lexical diversity values
ldiv_segments = create_segments(episodes, df['lexical_diversity'])
lc = LineCollection(ldiv_segments, color=color_array, alpha=0.6)
lc.set_linewidth(2)
ldiv_line = ax1.add_collection(lc)

# plot lexical diversity values
ldiv_scatter = ax1.scatter(episodes, df['lexical_diversity'], marker='o', s=10**2, 
                         c=color_array, label='Lexical Diversity')

ax1.set_xlim([-0.5, len(episodes) - 0.5])
ax1.set_ylim([0, 1400])
ax1.set_xlabel('Episode')
ax1.set_xticks(episodes)
ax1.set_xticklabels(episodes)
ax1.set_ylabel('Lexical Diversity')

# draw line between lexical density values
ax2 = ax1.twinx()

ldens_segments = create_segments(episodes, df['lexical_density'])
lc = LineCollection(ldens_segments, color=color_array, alpha=0.6)
lc.set_linewidth(2)
ldens_line = ax2.add_collection(lc)

# plot lexical density values
ldens_scatter = ax2.scatter(episodes, df['lexical_density'], marker='s', s=10**2,
                           c=color_array, label='Lexical Density')
ax2.set_ylim([0, 1])
ax2.set_ylabel('Lexical Density')

# add grid that matches both y-axis
ax1.set_yticks(np.linspace(ax1.get_yticks()[0], ax1.get_yticks()[-1], len(ax2.get_yticks())))
ax1.grid()

## ensure that ticks for y1 are nice numbers?!
##
##
##


# add legend
legend_elements = [
    Line2D([0], [0], marker='o', color='k', label='Lexical Diversity',
           markerfacecolor='k', markersize=10),
    Line2D([0], [0], marker='s', color='k', label='Lexical Density',
           markerfacecolor='k', markersize=10)]
ax2.legend(handles=legend_elements)

plt.show()

## VADER Sentiment Analysis

VADER works best on short texts (a couple sentences at most), and applying it to an entire chapter at once resulted in extreme and largely worthless scores. Instead, I looped over each sentence individually, got the VADER scores, and then took an average of all sentences in a chapter.

Note that there are a lot of sentences without sentiment, i.e. having sentiment score zero. Including these causes the mean polarity to be almost zero. I decided to ignore non-sentiment-bearing sentence in the analysis.

https://github.com/cjhutto/vaderSentiment#about-the-scoring

https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

In [None]:
# tokenize subtitles into sentences
df['sentence_tokens'] = df['subtitles'].apply(sent_tokenize)

# calculate sentence Vader scores for all episodes
def analyzeSentences(sentences):
    """Calculate and return VADER compound values for list of strings."""
    analyzer = SentimentIntensityAnalyzer()
    
    compound_scores = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        compound_scores.append(vs['compound'])
        
    return compound_scores

df['sentence_scores'] = df['sentence_tokens'].apply(analyzeSentences)
df['mean_scores'] = df['sentence_scores'].apply(lambda x: sum(x) / len(x))

# mean scores without zero
df['sentence_scores_w0'] = df['sentence_scores'].apply(lambda x: [i for i in x if i != 0])

df['mean_scores_w0'] = df['sentence_scores_w0'].apply(lambda x: sum(x) / len(x))

# plot VADER mean scores for all episodes
fig, ax = plt.subplots(1, 1, figsize=(15,10))

episodes = list(df.index)
seasons = df['season']
color_array = [
    colors[0] if s == 1 
    else colors[1] if s == 2 
    else colors[2] if s == 3
    else colors[3] if s == 4
    else '' for s in seasons]

mean_scores = df['mean_scores']
mean_scores_w0 = df['mean_scores_w0']

mean_compound_score_w0_segments = create_segments(episodes, mean_scores_w0)
lc = LineCollection(mean_compound_score_w0_segments, color=color_array)
lc.set_linewidth(2)
mean_compound_score_w0_line = ax.add_collection(lc)

mean_compound_score_w0_scatter = ax.scatter(episodes, mean_scores_w0, marker='o', s=80, 
                         c=color_array)

# line including 0 values
mean_compound_score_segments = create_segments(episodes, mean_scores)
lc = LineCollection(mean_compound_score_segments, color=color_array, alpha=0.3)
lc.set_linewidth(2)
mean_compound_score_line = ax.add_collection(lc)

ax.set_xlabel('Episode')
ax.set_xticks(episodes)
ax.set_xticklabels(episodes)
ax.set_xlim([-0.5, len(episodes) - 0.5])
ax.set_ylabel('Average Sentiment')
ax.set_ylim([mean_scores_w0.min(), mean_scores_w0.max()])

# add line for IMDB ratings
##
##
##
##


# add legend for both sentiment line types and ratings


# add grid that matches both y-axis
#ax1.set_yticks(np.linspace(ax1.get_yticks()[0], ax1.get_yticks()[-1], len(ax2.get_yticks())))
#ax1.grid()

## ensure that ticks for y1 are nice numbers?!
##
##
##


plt.show()

In [None]:
df['rating']

In [None]:
# vertical bar plot with decreasing sentiment order, show 5 highest & lowest episodes
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15,10), sharex=True)

mean_scores_w0_ordered = df[['season', 'title', 'mean_scores_w0']].sort_values(by=['mean_scores_w0'], 
                                                                               ascending=True)
seasons = mean_scores_w0_ordered['season']
titles = mean_scores_w0_ordered['title']
scores = mean_scores_w0_ordered['mean_scores_w0']
color_array = [
    colors[0] if s == 1 
    else colors[1] if s == 2 
    else colors[2] if s == 3
    else colors[3] if s == 4
    else '' for s in seasons]

# plot data on both axes
ax1.barh(titles, scores, color=color_array)
ax2.barh(titles, scores, color=color_array)

# set view of subplots
ax1.set_ylim([len(titles) - 5.5, len(titles) - 0.5])
ax2.set_ylim([-0.5, 5.5])

# hide ticks between axes
ax1.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax1.xaxis.tick_top()
ax1.tick_params(labeltop=False)
ax2.xaxis.tick_bottom()

# create diagonal lines for axis break
d = .005  # size in axes coordinates

kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)        # top-left diagonal
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-right diagonal

kwargs.update(transform=ax2.transAxes)  # switch to the bottom axes
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal

ax2.set_xlabel('Average Sentiment')

# add color legend for seasons
##
##
##
##

# move ticklabels inside graph, top five on left sid of bar, bottom five on right side of bar
##
##
##
##

# add legend
legend_elements = [
    Patch(color=colors[0], label='Season 1'),
    Patch(color=colors[1], label='Season 2'),
    Patch(color=colors[2], label='Season 3'),
    Patch(color=colors[3], label='Season 4')]
ax1.legend(handles=legend_elements)


fig.subplots_adjust(hspace=0.05) # reduce distance between subplots
plt.show()

## Conclusion
