In [1]:
import nltk
import os
import pandas as pd
import re

## Data Import & Cleaning

Load the subtitle files, remove unnecessary information, and prepare the text for analysis.

In [2]:
# directory specifications
subtitle_dir = "subtitles"
seasons = ["S01", "S02", "S03", "S04"]

# columns of the resulting DataFrame
columns = ('season', 'episode', 'title', 'subtitles')


def read_season_records(directory, season):
    """
    Read every .srt file found directly inside one season folder.

    The episode number and title are sliced out of the file name by position:
    characters 4-5 are parsed as the episode number and everything from
    position 7 up to the ".srt" extension is kept as the title
    (presumably names look like "S01E02_<title>.srt" -- confirm against the data).

    Parameters:
    - directory: path of the season folder
    - season: season label such as "S01"; its last two characters are parsed
      as the season number

    Returns: list of dicts with the keys 'season', 'episode', 'title' and
    'subtitles' (the file content as a list of lines).

    Raises: FileNotFoundError if the directory does not exist.
    """
    season_records = []
    for file_name in os.listdir(directory):
        if not file_name.endswith(".srt"):
            continue
        file_path = os.path.join(directory, file_name)
        # the context manager closes the handle even if reading fails;
        # 'utf-8-sig' drops a leading byte order mark if the file has one
        with open(file_path, mode='r', encoding='utf-8-sig') as srt_file:
            file_content = srt_file.read().splitlines()
        season_records.append({'season': int(season[-2:]),
                               'episode': int(file_name[4:6]),
                               'title': file_name[7:-4],
                               'subtitles': file_content})
    return season_records


# collect one record per episode and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call)
records = []
for s in seasons:
    directory = os.path.join(os.getcwd(), subtitle_dir, s)
    try:
        records.extend(read_season_records(directory, s))
    # directory does not exist
    except FileNotFoundError:
        print("Directory " + subtitle_dir + "/" + s + " not found.")

# store imported file content
df = pd.DataFrame(records, columns=list(columns))

Directory subtitles/S02 not found.
Directory subtitles/S03 not found.
Directory subtitles/S04 not found.


In [3]:
def clean_subtitles(subtitles):
    """
    Cleans list of strings imported from .srt file.

    Removes:
    - numbering
    - timestamps
    - empty strings (including lines that are empty after cleaning)
    - markup tags such as <i>...</i>
    - contractions
    - special characters
    - upper case letters

    Returns: List of remaining strings.
    """
    cleaned = []
    for line in subtitles:
        # remove numbering, empty strings & timestamps
        if line.isdigit() or line == '' or '-->' in line:
            continue

        # drop markup tags first; otherwise only the brackets are stripped
        # below and the tag letters end up glued to the neighbouring words
        line = re.sub(r"</?[a-zA-Z][^>]*>", "", line)

        # treat the typographic apostrophe like the ASCII one so the
        # contraction rules below also match it
        line = line.replace("\u2019", "'")

        # convert everything to lowercase *before* expanding contractions,
        # so capitalised forms ("Can't", "I'M") are matched as well
        line = line.lower()

        # replace contractions; the specific forms must come before "n't"
        line = (line.replace("'m", " am")
                    .replace("can't", "cannot")
                    .replace("won't", "will not")
                    .replace("n't", " not")
                    .replace("'ve", " have")
                    .replace("'re", " are")
                    .replace("'ll", " will")
                    .replace("'d", " would")
                    .replace("'s", ""))  # could be "is" or possessive

        # remove special characters
        line = re.sub("[^a-z0-9 ]", "", line)

        # a line made up only of symbols (e.g. music notes) is empty by now
        if line.strip():
            cleaned.append(line)

    return cleaned

# run each episode's list of raw lines through clean_subtitles,
# replacing the 'subtitles' column with the cleaned lists
df['subtitles'] = df['subtitles'].map(clean_subtitles)

# preview the first rows
df.head()

Unnamed: 0,season,episode,title,subtitles
0,1,2,eps1.1_ones-and-zer0es.mpeg,"[what i am about to tell you is top secret, ev..."
1,1,8,eps1.7_wh1ter0se.m4v,"[the redundant backups at their, eastern data ..."
2,1,1,eps1.0_hellofriend.mov,"[hello friend, hello friend, that lame, maybe ..."
3,1,3,eps1.2_d3bug.mkv,"[i will be the youngest executive, this compan..."
4,1,9,eps1.8_m1rr0r1ng.qt,"[exciting time in the world right now, exciti..."


## Text Analytics

- Word Frequency: number of words per episode
- Word Length: average length of the individual words in a text
- Lexical Diversity: number of unique words used in a text (episode vocabulary)
- Lexical Density: number of unique words divided by the total number of words (a measure of word repetition)


Identify undesirable words that should be removed.