# Trailer comments pre-processing.

- Clean-up records: remove comments whose text is null.
- Generate sentiment score: use a pre-trained model (TextBlob) to generate a sentiment (polarity) score for each comment.
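- Label whether to use a comment for training: a comment made on or after the cutoff date (7 days before the movie's release date) shouldn't be used for training. Comments are also labelled for visualization (made earlier than 90 days after the release date).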


In [1]:
import pandas as pd
import time
import json
import matplotlib.pyplot as plt
import datetime
import os
import random
from textblob import TextBlob

from multiprocessing import Pool, cpu_count
from functools import partial

# for auto reload a module after editing.
%load_ext autoreload
%autoreload 2

In [91]:
# Input: raw per-trailer comment CSVs. Output: the processed copies written by process_comment_df.
trailer_comments_in_dir = "./data/trailer_comments_raw"
trailer_comments_out_dir = "./data/trailer_comments"
# Create the output directory up front so the to_csv calls do not fail on a fresh checkout.
os.makedirs(trailer_comments_out_dir, exist_ok=True)

# Trailer list; process_comment_df looks up each trailer's release_date by trailer_id in it.
trailer_list_file = "./data/trailer_list/trailer_list_2014-2019.csv"
trailer_list_df = pd.read_csv(trailer_list_file)

# os.listdir returns names in arbitrary order; sort so every run sees the same file order.
f_list = sorted(f for f in os.listdir(trailer_comments_in_dir) if f.endswith('.csv'))
print(len(f_list))

1805


In [2]:
def process_comment_df(f, model_cutoff_days=7, visualize_cutoff_days=90):
    """Clean, score and label the comments of one trailer file.

    Reads ``f`` from ``trailer_comments_in_dir``, drops comments whose ``text``
    is null, adds a TextBlob polarity ``sentiment_score``, parses ``timestamp``
    into a ``datetime`` column, flags each comment with ``used_for_model`` and
    ``used_for_visualization``, sorts by ``datetime`` and writes the result
    under the same file name in ``trailer_comments_out_dir``.

    Parameters
    ----------
    f : str
        File name (not path) of a comment CSV inside ``trailer_comments_in_dir``.
        The file must provide ``text``, ``trailer_id`` and ``timestamp`` columns.
    model_cutoff_days : int, optional
        A comment is ``used_for_model`` only if it was made earlier than this
        many days *before* the movie's release date. Defaults to 7.
    visualize_cutoff_days : int, optional
        A comment is ``used_for_visualization`` only if it was made earlier
        than this many days *after* the movie's release date. Defaults to 90.

    Returns
    -------
    None
        The processed frame is written to disk. Files with no non-null
        comments are skipped and produce no output file.

    Raises
    ------
    IndexError
        If the file's trailer id has no row in ``trailer_list_df``.
    """
    df = pd.read_csv(os.path.join(trailer_comments_in_dir, f))

    # Remove null records.
    df = df[df['text'].notna()].reset_index(drop=True)
    if df.empty:
        # Without any row there is no trailer_id to look up; skipping keeps one
        # empty file from aborting the whole parallel run.
        print("Skipping {0}: no comments with non-null text.".format(f))
        return

    # Get sentiment score. str() guards against values pandas parsed as numbers
    # (e.g. a comment that is just "10"), which TextBlob rejects with a TypeError.
    df['sentiment_score'] = df['text'].apply(lambda text: TextBlob(str(text)).sentiment.polarity)

    # The trailer id is taken from the first row
    # (assumes one trailer per file -- TODO confirm).
    trailer_id = df['trailer_id'].iloc[0]
    release_dates = trailer_list_df.loc[trailer_list_df['trailer_id'] == trailer_id, 'release_date']
    if release_dates.empty:
        raise IndexError(
            "trailer_id {0!r} from file {1!r} not found in trailer_list_df".format(trailer_id, f)
        )
    movie_release_date = release_dates.iloc[0]

    df['datetime'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%dT%H:%M:%S")
    release_date = datetime.datetime.strptime(movie_release_date, "%Y-%m-%d")
    model_cutoff_date = pd.Timestamp(release_date - datetime.timedelta(days=model_cutoff_days))
    visualize_cutoff_date = pd.Timestamp(release_date + datetime.timedelta(days=visualize_cutoff_days))

    # Vectorized comparisons; strictly-before semantics as in the row-wise version.
    df['used_for_model'] = df['datetime'] < model_cutoff_date
    df['used_for_visualization'] = df['datetime'] < visualize_cutoff_date

    # Stable sort so comments sharing a timestamp keep their input order.
    df = df.sort_values('datetime', kind='mergesort').reset_index(drop=True)
    df.to_csv(os.path.join(trailer_comments_out_dir, f), index=False)

In [93]:
# Process every comment file in parallel, one worker per CPU core,
# and report the wall-clock time of the whole run.
started_at = time.time()
with Pool(processes=cpu_count()) as pool:
    pool.map(process_comment_df, f_list)
elapsed_seconds = time.time() - started_at
print("Finished. Time: {0:.1f} s.".format(elapsed_seconds))

Finished. Time: 1095.4 s.
