In [3]:
#build frames for movie genre classification
import pandas as pd

#method for turning a txt file into a pandas dataframe
def buildFrame(input_file, encoding_type, delim):
    frame = pd.read_table(input_file, encoding=encoding_type, sep=delim)
    return frame

#format method for data frame with movie Genre
def formatFrame(df):
    
    #assign the columns
    df.columns = ["movie_id", "movie_title", "movie_year", "IMDB_rating", "num_IMDB_votes", "genre_list"]
    
    #remove extra characters from genre list
    df['genre_list'] = df['genre_list'].map(lambda x: x.strip('[').strip(']').replace(' ','').replace('\'',''))
    
    #do one-hot encoding for list of genres
    res = pd.get_dummies(df['genre_list'].str.split(',').apply(pd.Series).stack(), drop_first=True).sum(level=0)
    df = df.join(res)
    
    #remove old genre list
    df.drop(['genre_list'], axis = 1, inplace=True)
    
    #check for errors in movie year
    df['movie_year'] = pd.to_numeric(df['movie_year'], errors='coerce')
    return df

df = buildFrame("data/movie_titles_metadata.txt", 'latin1', '\s\+\+\+\$\+\+\+\s')
df = formatFrame(df)
df.head()

  


Unnamed: 0,movie_id,movie_title,movie_year,IMDB_rating,num_IMDB_votes,action,adult,adventure,animation,biography,...,music,musical,mystery,romance,sci-fi,short,sport,thriller,war,western
0,m1,1492: conquest of paradise,1992.0,6.2,10421,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,m2,15 minutes,2001.0,6.1,25854,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,m3,2001: a space odyssey,1968.0,8.4,163227,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
3,m4,48 hrs.,1982.0,6.9,22289,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,m5,the fifth element,1997.0,7.5,133756,1,0,1,0,0,...,0,0,0,1,1,0,0,1,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 29 columns):
movie_id          616 non-null object
movie_title       616 non-null object
movie_year        600 non-null float64
IMDB_rating       616 non-null float64
num_IMDB_votes    616 non-null int64
action            616 non-null uint8
adult             616 non-null uint8
adventure         616 non-null uint8
animation         616 non-null uint8
biography         616 non-null uint8
comedy            616 non-null uint8
crime             616 non-null uint8
documentary       616 non-null uint8
drama             616 non-null uint8
family            616 non-null uint8
fantasy           616 non-null uint8
film-noir         616 non-null uint8
history           616 non-null uint8
horror            616 non-null uint8
music             616 non-null uint8
musical           616 non-null uint8
mystery           616 non-null uint8
romance           616 non-null uint8
sci-fi            616 non-null uint8
s

In [5]:
#correlation matrix to see the features most correlated (Pearson's) with sci-fi
corr_matrix = df.corr()
corr_matrix['sci-fi'].sort_values(ascending=False)

sci-fi            1.000000
adventure         0.373963
action            0.373572
num_IMDB_votes    0.123537
thriller          0.101442
animation         0.091372
mystery           0.091135
fantasy           0.084297
horror            0.058373
family            0.036956
movie_year        0.024064
adult            -0.018371
documentary      -0.031872
western          -0.036746
film-noir        -0.036832
music            -0.040480
short            -0.041214
sport            -0.052260
musical          -0.052260
IMDB_rating      -0.073280
history          -0.085591
war              -0.089725
biography        -0.093703
crime            -0.119345
comedy           -0.121097
romance          -0.162551
drama            -0.238134
Name: sci-fi, dtype: float64

In [6]:
#select a train and test set. 
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
train_set.head()

Unnamed: 0,movie_id,movie_title,movie_year,IMDB_rating,num_IMDB_votes,action,adult,adventure,animation,biography,...,music,musical,mystery,romance,sci-fi,short,sport,thriller,war,western
300,m301,a clockwork orange,1971.0,8.5,197372,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
608,m609,witness,1985.0,7.6,30705,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
249,m250,assassins,1995.0,6.0,23681,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
163,m164,quills,2000.0,7.3,22657,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
462,m463,one flew over the cuckoo's nest,1975.0,8.9,219739,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#verify the data split
total = len(df.loc[df['sci-fi'] > 0])
test = len(test_set.loc[test_set['sci-fi'] > 0])
train = len(train_set.loc[train_set['sci-fi'] > 0])

print('total sci fi = ', total, 'total sci-fi in train = ', train , 'total sci-fi in test = ', test)

total sci fi =  112 total sci-fi in train =  90 total sci-fi in test =  22


In [9]:
#save correct labels
movie_labels = train_set['sci-fi'].copy()

#drop sci-fi label
movie_train = train_set.drop('sci-fi', axis=1)
movie_test = test_set.drop('sci-fi', axis=1)

#remove non-numerical values
movie_train = movie_train.select_dtypes(exclude=['object'])
movie_test = movie_test.select_dtypes(exclude=['object'])

#feature vector to train/test
movie_test.head()

Unnamed: 0,movie_year,IMDB_rating,num_IMDB_votes,action,adult,adventure,animation,biography,comedy,crime,...,horror,music,musical,mystery,romance,short,sport,thriller,war,western
78,1990.0,7.0,11481,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
208,1945.0,7.4,2927,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
570,1971.0,6.8,15741,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
181,1999.0,8.2,244162,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
101,2003.0,6.4,36739,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


In [11]:
#convert Pandas Dataframs to NumPy Arrays
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [12]:
#pipeline for numerical attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

num_attribs = list(movie_train)

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [13]:
#one wrapper for all pipelines
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
])

In [14]:
#perform the full transform on our training data
movies_prepared = full_pipeline.fit_transform(movie_train)
movies_prepared

array([[-1.01804145,  1.34057674,  2.48829697, ..., -0.83168756,
        -0.20042239, -0.13650473],
       [-0.18025537,  0.59511978, -0.30131235, ...,  1.14507708,
        -0.20042239, -0.13650473],
       [ 0.41816326, -0.73013705, -0.41887741, ...,  1.14507708,
        -0.20042239, -0.13650473],
       ...,
       [ 0.8370563 , -0.23316574,  0.05220298, ...,  1.14507708,
        -0.20042239, -0.13650473],
       [ 0.71737258,  1.50623384,  3.88848278, ...,  1.14507708,
        -0.20042239, -0.13650473],
       [-3.23219038,  1.17491964, -0.38714287, ..., -0.83168756,
        -0.20042239, -0.13650473]])

In [15]:
movies_prepared.shape

(492, 26)

In [16]:
#dp a simple linear regression
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(movies_prepared, movie_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
#compute the Mean Square Error (accuracy) of the model ... needs some work!
from sklearn.metrics import mean_squared_error
import numpy as np

genre_predict = lin_reg.predict(movies_prepared)
lin_mse = mean_squared_error(movie_labels, genre_predict)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.36463911194057363

In [18]:
#now wrap the pipline and predictors
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("linear", LinearRegression())
    ])

full_pipeline_with_predictor.fit(movie_train, movie_labels)
full_pipeline_with_predictor.predict(movie_test)

array([-5.54500665e-02,  1.66765703e-01,  2.44118823e-01,  3.23614155e-01,
       -9.28077253e-02,  1.97648355e-01, -8.76840136e-02,  2.04930867e-01,
       -4.45944075e-03, -8.69129874e-02,  2.12985055e-01,  6.60680246e-02,
        6.56823482e-01,  3.15775587e-01,  5.52866952e-01, -9.88798234e-02,
       -3.65877100e-02,  8.31410134e-03,  1.73320387e-01, -3.54466161e-02,
        6.43461601e-01,  1.43548162e-01,  2.05317139e-01,  2.32397865e-01,
        2.80531645e-01,  2.24824869e-01,  3.05758433e-01, -2.21261949e-02,
        3.89568660e-01, -3.59103019e-02,  1.22899711e-01, -3.33730135e-02,
        3.53309101e-02,  1.64549181e-01,  1.48508910e-02,  1.20160747e+00,
       -1.03568728e-01,  9.67807124e-02,  6.35066793e-02,  1.44604734e-01,
        2.73873866e-01,  1.55683053e-01,  1.51749242e-01,  1.08236073e-01,
       -1.75949341e-01, -1.56804288e-02, -6.36660296e-02,  5.08066087e-01,
        4.15728283e-01,  3.72019761e-01, -4.35308117e-02,  7.00147265e-02,
        5.10011339e-01,  

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion

my_model = full_pipeline_with_predictor

print(movie_labels.iloc[:5])

300    1
608    0
249    0
163    0
462    0
Name: sci-fi, dtype: uint8


In [20]:
from sklearn.externals import joblib
#save file as pickle
joblib.dump(my_model, "movie_genre_model.pkl") # DIFF

#load model as pickle
my_model_loaded = joblib.load("movie_genre_model.pkl") # DIFF

In [62]:
def get_id2line():
    lines=open('data/movie_lines.txt', encoding="ISO-8859-1").read().split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
    return id2line

def get_conversations():
    conv_lines = open('data/movie_conversations.txt', encoding="ISO-8859-1").read().split('\n')
    convs = [ ]
    for line in conv_lines[:-1]:
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
        convs.append(_line.split(','))
    return convs


def extract_conversations(convs,id2line,path=''):
    idx = 0
    for conv in convs:
        f_conv = open(path + str(idx)+'.txt', 'w')
        for line_id in conv:
            f_conv.write(id2line[line_id])
            f_conv.write('\n')
        f_conv.close()
        idx += 1

def gather_dataset(convs, id2line):
    questions = []; answers = []

    for conv in convs:
        if len(conv) %2 != 0:
            conv = conv[:-1]
        for i in range(len(conv)):
            if i%2 == 0:
                questions.append(id2line[conv[i]])
            else:
                answers.append(id2line[conv[i]])

    return questions, answers


In [64]:
import pandas as pd
id2line = get_id2line()
convs = get_conversations()
questions, answers = gather_dataset(convs,id2line)

df = pd.DataFrame(convs)

print(convs[:2])
print(questions[:2])
print(answers[:2])
df.head

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199']]
['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', 'Not the hacking and gagging and spitting part.  Please.']
["Well, I thought we'd start with pronunciation, if that's okay with you.", "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]


<bound method NDFrame.head of             0        1        2        3        4        5        6        7   \
0         L194     L195     L196     L197     None     None     None     None   
1         L198     L199     None     None     None     None     None     None   
2         L200     L201     L202     L203     None     None     None     None   
3         L204     L205     L206     None     None     None     None     None   
4         L207     L208     None     None     None     None     None     None   
5         L271     L272     L273     L274     L275     None     None     None   
6         L276     L277     None     None     None     None     None     None   
7         L280     L281     None     None     None     None     None     None   
8         L363     L364     None     None     None     None     None     None   
9         L365     L366     None     None     None     None     None     None   
10        L367     L368     None     None     None     None     None     None  