# Part 0: Load packages and Read in Data

In [6]:
# load packages
%matplotlib inline
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import seaborn as sns
import sklearn as sl
from sklearn import preprocessing
# Render every float in pandas output with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

# Read in data
* `video_id` is the unique identifier of each video. `cvt_per_day` is the metric, defined as the cumulative time viewed by audiences per day. The remaining 14 columns are characteristic features (numeric and categorical) of each video.
* There are 4226 rows and 16 columns in the dataset. 

In [3]:
# Load the comma-separated video table; the first line of the file supplies the column names.
df = pd.read_csv(
    'TVdata.txt',
    sep=',',
    header=0,
    lineterminator='\n',
)
# Preview the first five rows.
df.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,307127.606,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301,51,other award,PG-13,1.71
1,300175,270338.426,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.983,41,no award,R,3.25
2,361899,256165.867,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.764,58,other award,R,2.647
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.704,94,Oscar,R,1.667
4,307201,159841.652,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.546,37,other award,R,3.067


In [5]:
# Report the dataset dimensions: df.shape is (row count, column count).
n_rows, n_cols = df.shape
print('Number of rows: ' + str(n_rows))
print('Number of columns: ' + str(n_cols))

Number of rows: 4226
Number of columns: 16


# Part 1: Data Exploration

## 1.1 Remove Duplication
* This step checks for duplicate entries by `video_id`, on the assumption that each movie should appear only once in the dataset. No duplicates are found, so no rows need to be removed.

In [7]:
# Check for duplicated entries: count the distinct video_id values that occur
# on more than one row.
# `Index.get_duplicates()` was deprecated in pandas 0.23 and removed in 1.0, so
# use `Series.duplicated()` instead. It flags every repeat occurrence after the
# first; `nunique()` then collapses those repeats to one per offending id,
# which is the same count `len(get_duplicates())` used to produce.
video_ids = df['video_id']
duplicated = video_ids[video_ids.duplicated()].nunique()
print('There is {} duplicated entry (video_id based).'.format(duplicated))

There is 0 duplicated entry (video_id based).


## 1.2 Check column attributes
* This step is important because we need to know whether each column is numeric or categorical.

In [9]:
df.dtypes

video_id                           int64
cvt_per_day                      float64
weighted_categorical_position      int64
weighted_horizontal_poition        int64
import_id                         object
release_year                       int64
genres                            object
imdb_votes                         int64
budget                             int64
boxoffice                          int64
imdb_rating                      float64
duration_in_mins                 float64
metacritic_score                   int64
awards                            object
mpaa                              object
star_category                    float64
dtype: object