In [1]:
import re
import string
import warnings
import datetime

import numpy as np
import pandas as pd

from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Install a blanket 'ignore' filter: every warning category is hidden from
# here on, including deprecation notices.
warnings.filterwarnings('ignore')

# Data Cleaning

In [2]:
# Load the two raw exports; both files are parsed as tab-delimited text.
df_sbuzz = pd.read_csv(
    "syndata/Box Office/Recruitment x Box Office x Q1 2019 - Twitter.csv",
    sep="\t",
)
df_title = pd.read_csv(
    "syndata/Box Office/Recruitment x Box Office x Q1 2019 - Titles Performace.csv",
    sep="\t",
)

In [3]:
# Normalise the title columns of both frames so they can be used as join keys,
# then split the tweet timestamp into separate date and time columns.

# errors='ignore' keeps this cell re-runnable after the column is already gone.
df_sbuzz = df_sbuzz.drop(axis=1, labels=['Unnamed: 0'], errors='ignore')

# Raw strings: "\w" / "\s" inside a normal string literal are invalid escape
# sequences.
punctuation = re.compile(r"[^\w\s]")
# Only prefix "predator" when it is not already preceded by "the ", so an
# existing "the predator" (or a second run of this cell) is left alone.
bare_predator = re.compile(r"(?<!the )predator")

df_sbuzz['title'] = df_sbuzz['title'].str.lower()
df_title['Title'] = df_title['Title'].str.lower()

# Strip punctuation, then repair the misspelt title so it matches the Twitter data.
df_title['Title'] = df_title['Title'].apply(lambda x: punctuation.sub("", x))
df_title['Title'] = df_title['Title'].apply(lambda x: re.sub("avengers infinty war", "avengers infinity war", x))

df_sbuzz['title'] = df_sbuzz['title'].apply(lambda x: punctuation.sub("", x))
df_sbuzz['title'] = df_sbuzz['title'].apply(lambda x: bare_predator.sub("the predator", x))

# post_date_time looks like "2017-06-27T10:29:01.000+0000": the part before
# "T" is the date, the part after it (up to the first ".") is the time.
# The split is done once and reused for both columns.
stamp_parts = df_sbuzz['post_date_time'].str.split('T').tolist()
df_sbuzz['post_date'] = pd.to_datetime(
    [datetime.datetime.strptime(parts[0], '%Y-%m-%d').date() for parts in stamp_parts]
)
df_sbuzz['post_time'] = [
    datetime.datetime.strptime(parts[1].split('.')[0], '%H:%M:%S').time()
    for parts in stamp_parts
]

In [4]:
# Give every distinct title a 1-based integer id in order of first appearance:
# all Twitter titles first, then any box-office titles not seen there.
movie_dict = {}
c = 0

for title in [*df_sbuzz['title'], *df_title['Title']]:
    if title in movie_dict:
        continue
    c += 1
    movie_dict[title] = c

# Inverse lookup: id -> title.
rev_movie_dict = dict(zip(movie_dict.values(), movie_dict.keys()))

In [5]:
# movie_dict

In [6]:
# Derive per-row columns on df_title: begin/end dates of the chart period,
# a weekend flag, the ISO week number, the numeric movie id and float grosses.


def _parse_date_range(text, year):
    """Parse a chart period such as "Sept. 13–16" or "Nov. 30–Dec. 2".

    Returns ``(begin, end)`` as ``datetime.date`` objects, both placed in
    ``year``. A part without a month name inherits the month of the part
    before it.

    Raises ``ValueError`` when the text is not a two-part range, when the
    first part has no month, or when ``strptime`` rejects a part.
    """
    # Drop dots/whitespace ("Sept. 13–16" -> "Sept13–16"); accept "-" as well as "–".
    parts = re.split(r"[–-]", re.sub(r"[.\s]", "", text))
    if len(parts) != 2:
        raise ValueError("expected a '<begin>–<end>' date range, got {!r}".format(text))

    month = None  # reset per call so a month can never leak in from another row
    parsed = []
    for part in parts:
        letters = "".join(re.findall(r"[a-zA-Z]", part))
        day = "".join(re.findall(r"[0-9]", part))
        if letters:
            # %b only accepts three-letter abbreviations ("Sept" -> "Sep").
            month = letters[:3]
        if month is None:
            raise ValueError("no month found in date range {!r}".format(text))
        stamp = '{} {} {}'.format(month, day, year)
        parsed.append(datetime.datetime.strptime(stamp, '%b %d %Y').date())
    return parsed[0], parsed[1]


def _covers_weekend(begin, end):
    """Return True when the span begin..end (end day included) touches Sat/Sun.

    Only weekdays are compared, so the span is assumed to be shorter than a
    week. The end day is part of the span: a range(start, end) over weekdays
    would drop it and report e.g. a Thursday–Saturday period as weekend-free.
    """
    first = begin.weekday()
    length = (end.weekday() - first) % 7
    return any((first + offset) % 7 >= 5 for offset in range(length + 1))


def _money_to_float(value):
    """Strip currency formatting and convert to float ("$1,323" -> 1323.0)."""
    return float(re.sub(r'[^\d.]', '', value))


for i in df_title.index:
    # NOTE(review): both dates are placed in the row's 'Year'; a period that
    # crosses New Year (e.g. "Dec. 28–Jan. 1") therefore gets an end_date
    # earlier than its begin_date — confirm which year the column refers to.
    begin_date, end_date = _parse_date_range(df_title.at[i, 'Date'], df_title.at[i, 'Year'])

    df_title.at[i, 'begin_date'] = begin_date
    df_title.at[i, 'end_date'] = end_date
    df_title.at[i, 'has_weekend'] = _covers_weekend(begin_date, end_date)
    df_title.at[i, 'nth_week_number'] = begin_date.isocalendar()[1]
    df_title.at[i, 'movie_id_t'] = int(movie_dict[df_title.at[i, 'Title']])
    df_title.at[i, 'gross_to_date_float'] = _money_to_float(df_title.at[i, 'Gross-to-Date'])
    try:
        df_title.at[i, 'weekend_gross_float'] = _money_to_float(df_title.at[i, 'Weekend Gross'])
    except (TypeError, ValueError):
        # Missing (NaN) or empty weekend gross counts as 0; anything else propagates.
        df_title.at[i, 'weekend_gross_float'] = 0
        

In [7]:
# Per-tweet derived columns, computed column-wise instead of one .at write per
# cell.
movie_ids = df_sbuzz['title'].map(movie_dict)
if movie_ids.isna().any():
    # A direct movie_dict[title] lookup raises KeyError for unknown titles;
    # keep that strictness instead of silently writing NaN ids.
    raise KeyError(
        "titles missing from movie_dict: {}".format(
            df_sbuzz.loc[movie_ids.isna(), 'title'].unique().tolist()
        )
    )

# Stored as float so the values written to the cleaned CSV (e.g. 7.0, 26.0)
# are unchanged.
df_sbuzz['movie_id_s'] = movie_ids.astype(float)
df_sbuzz['nth_week_number'] = (
    df_sbuzz['post_date'].map(lambda stamp: stamp.isocalendar()[1]).astype(float)
)
# Accounts with more than 10,000 followers are flagged as influencers.
df_sbuzz['is_influencer'] = df_sbuzz['user_followers_count'] > 10000

In [8]:
# Score every tweet with VADER and keep only the 'compound' value.
# Series.apply returns a Series on df_sbuzz's own index, so the assignment
# lines up row-for-row whatever that index is. Wrapping the scores in a fresh
# pd.Series would re-index them 0..n-1 and only align on a default index.
analyzer = SentimentIntensityAnalyzer()
df_sbuzz['tweet_sentiment'] = df_sbuzz['post_content'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)

In [9]:
# Spot-check three randomly chosen rows of the box-office frame.
df_title.sample(n=3)

Unnamed: 0,Title,Date,Year,Rank,Weekend Gross,Gross-to-Date,Week # of release,begin_date,end_date,has_weekend,nth_week_number,movie_id_t,gross_to_date_float,weekend_gross_float
76,black panther,Sept. 13–16,2018,44,"$1,323","$31,937,283",31,2018-09-13,2018-09-16,True,37.0,4.0,31937283.0,1323.0
176,night school,Oct. 4–7,2018,7,"$863,820","$2,842,690",2,2018-10-04,2018-10-07,True,40.0,15.0,2842690.0,863820.0
55,the greatest showman,Jul. 12–15,2018,0,,"$26,670,991",29,2018-07-12,2018-07-15,True,28.0,3.0,26670991.0,0.0


In [10]:
# Spot-check three randomly chosen rows of the tweet frame.
df_sbuzz.sample(n=3)

Unnamed: 0,ID,title,post_link,post_date_time,post_content,user_name,user_bio,user_followers_count,user_followee_count,user_post_count,country,state,language,post_date,post_time,movie_id_s,nth_week_number,is_influencer,tweet_sentiment
30087,5,wonder woman,http://twitter.com/KIIS1065/status/87964774893...,2017-06-27T10:29:01.000+0000,Kristie from @thethinkergirls loved Wonder Wom...,KIIS1065,Sydney’s #1 Hit Music Station. @KyleandJackieO...,37564,979,53385,AUS,New South Wales,en,2017-06-27,10:29:01,7.0,26.0,True,0.7772
61279,16,bird box,http://twitter.com/imvnwhO/status/107799710468...,2018-12-26T18:38:32.000+0000,RT @BoxMemery #BirdBox . . Bird Box without co...,imvnwhO,you can call me Im ♡ #soulmat,1210,503,161933,AUS,Victoria,en,2018-12-26,18:38:32,16.0,52.0,False,0.0
14457,1,black panther,http://twitter.com/thereversewill/status/96557...,2018-02-19T13:19:45.000+0000,Black panther was really good i could’ve done ...,thereversewill,"if the apocalypse comes, beep me. she/her",49,34,5095,AUS,South Australia,en,2018-02-19,13:19:45,4.0,8.0,False,0.7267


In [11]:
# Write both cleaned frames as comma-separated files without the index column.
for frame, destination in (
    (df_sbuzz, "syndata/Box Office/twitter_clean.csv"),
    (df_title, "syndata/Box Office/box_office_clean.csv"),
):
    frame.to_csv(destination, sep=",", index=False, date_format="%Y-%m-%d")

-----------