In [13]:
import ast
import datetime
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

%matplotlib inline

In [2]:
# Load the Excel workbook into a DataFrame; the first column of the sheet
# becomes the index.
df = pd.read_excel(
    io='data/all_with_liwc_segmented.xls',
    index_col=0,
)
# print(df.shape)
# display(df.head())

In [3]:
# Most columns are not needed yet: keep only the first 34 ...
df = df[df.columns[:34]]
# ... and, out of those, discard the columns listed below.
df.drop(
    columns=[
        'persuasive',
        'inspiring',
        'unconvincing',
        'norm_persuasive',
        'norm_inspiring',
        'norm_unconvincing',
        'music',
        'conversation',
    ],
    inplace=True,
)
# show first entry
# df.iloc[0]

In [4]:
# Convert the date columns from Unix timestamps (seconds since the epoch) to
# pandas datetimes.
#
# pd.to_datetime(..., unit='s') converts the whole column at once and always
# interprets the epoch in UTC. The previous element-wise
# datetime.datetime.fromtimestamp() call returned *local* time, so the
# resulting dates depended on the timezone of the machine running the
# notebook.
for date_col in ('film_date', 'published_date'):
    df[date_col] = pd.to_datetime(df[date_col], unit='s')
# df.iloc[0]

In [5]:
# df.iloc[0]['ratings']

In [6]:
# Currently ratings, related_talks, and tags are stored as strings.
# Need to convert them to lists by evaluating the string.
#
# ast.literal_eval only accepts Python literals (lists, dicts, strings,
# numbers, ...), so unlike eval() it cannot execute code that happens to be
# embedded in the data file.
def _parse_literal(cell):
    """Return the Python literal encoded in `cell`.

    Values that are not strings are returned unchanged, which keeps this
    cell safe to run again after the columns have already been converted.
    Raises ValueError/SyntaxError if a string is not a valid Python literal.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


for list_col in ('ratings', 'related_talks', 'tags'):
    df[list_col] = df[list_col].apply(_parse_literal)
# df.iloc[0]

In [7]:
# Instead of having the ratings buried in a column of lists of dictionaries,
# make each individual rating category a new column. The author of the
# dataset scraped 14 fixed rating categories from the source; those
# categories become the new columns and will probably end up being the target
# variables in one way or another.
#
# A normed column is also created for each rating: the count of that rating
# category divided by the talk's total number of views, so that ratings are
# comparable across talks.

# One {category: count} record per talk, built in a single pass rather than
# assigning every cell individually through .loc.
rating_counts = pd.DataFrame(
    [
        {rating['name']: rating['count'] for rating in talk_ratings}
        for talk_ratings in df['ratings']
    ],
    index=df.index,
)

# rating categories (sorted) for the new columns
new_cols = sorted(rating_counts.columns)
# same list as above with 'norm_' in front
normed_cols = ['norm_{}'.format(x) for x in new_cols]

# Float dtype, so a category that is missing for a talk stays NaN.
rating_counts = rating_counts[new_cols].astype(float)
# Divide every talk's counts by that talk's view count (row-wise alignment).
normed_counts = rating_counts.div(df['views'], axis=0)

# Add the raw count columns first, then the normed ones.
for col in new_cols:
    df[col] = rating_counts[col]
for col, normed_col in zip(new_cols, normed_cols):
    df[normed_col] = normed_counts[col]

# df.head()

In [8]:
# NLP starts here: take an independent copy of the transcript column to
# work on.
transcript = df.loc[:, 'transcript'].copy()

In [14]:
# convert stop words to a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Built once and reused by preprocess(): the function is applied to every
# transcript, so constructing these inside it would repeat the work per call.
# The pattern keeps tokens that start with a letter, are at least two
# characters long and may contain one apostrophe (so contractions stay in one
# piece); punctuation is never matched and is therefore dropped.
_tokenizer = RegexpTokenizer(r'[a-zA-Z]\w+\'?\w*')
_lemmatizer = WordNetLemmatizer()


def preprocess(sentence):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Parameters
    ----------
    sentence : str
        Text to process. Anything that is not a string (for example NaN for
        a missing value) produces an empty list instead of raising.

    Returns
    -------
    list of str
        Lowercased tokens in their original order. Tokens found in
        `stop_words` are removed before lemmatization, and the remaining
        tokens are lemmatized with the lemmatizer's default settings.
    """
    # missing / non-text cells have nothing to tokenize
    if not isinstance(sentence, str):
        return []
    # lowercase first so the stop-word lookup is case-insensitive
    tokens = _tokenizer.tokenize(sentence.lower())
    # remove stop words and lemmatize what is left
    return [_lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

In [15]:
# Run the preprocessing function over every transcript.
filtered_words = transcript.apply(func=preprocess)