In [158]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import time
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def read_movie_titles():
    with open('Project2-data/movie_titles.txt', 'r', encoding = "ISO-8859-1") as fp:
        movie_titles = fp.read().splitlines()

    rule = "(\d+),(.*),(.*)"
    l = []
    for title in movie_titles:
        hit = re.match(rule, title)
        l.append([hit[1], hit[2], hit[3]])
    return pd.DataFrame(l, columns=['id', 'mid', 'title'])

df_title = read_movie_titles()
df_user = pd.read_csv('Project2-data/users.txt', sep=' ', header=None, names=['uid'])
df_train = pd.read_csv('Project2-data/netflix_train.txt', sep=' ', header=None, names=['uid', 'mid', 'rating', 'time'])
df_test = pd.read_csv('Project2-data/netflix_test.txt', sep=' ', header=None, names=['uid', 'mid', 'rating', 'time'])

In [124]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    return doc

sentences = [preprocess(title) for title in df_title.title]
model = Word2Vec(sentences, size=300, min_count=1, workers=8, negative=5)
model.wv.save_word2vec_format('netflix.emb')

embedding = pd.read_csv('netflix.emb',sep=' ',skiprows=1,header = None, index_col=0)
emb_title = pd.DataFrame([embedding.ix[words].mean() for words in sentences])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [159]:
import lightgbm as lgb

def preprocessing(df):
    df['time'] = pd.to_datetime(df['time'])
    df['year'] = df.time.dt.year
    df['month'] = df.time.dt.month
    df['day'] = df.time.dt.day
    df['dayofweek'] = df.time.dt.dayofweek
    df['dayofyear'] = df.time.dt.dayofyear
    df['weekofyear'] = df.time.dt.weekofyear
    
    y = df.pop('rating')
    X = df.drop('time', axis=1)
    data = lgb.Dataset(X, y)
    
    return data

lgb_train = preprocessing(df_train)
lgb_eval = preprocessing(df_test)

In [168]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2_root'},
    'max_depth': 10,
    'num_leaves': 31,
    'learning_rate': 0.08,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Start training...')
gbm = lgb.train(params, 
                lgb_train, 
                num_boost_round=2000, 
                valid_sets=[lgb_train, lgb_eval], 
                early_stopping_rounds=5)


print('Save model...')
gbm.save_model('model.txt')

Start training...
[1]	training's rmse: 1.10476	valid_1's rmse: 1.10468
Training until validation scores don't improve for 5 rounds.
[2]	training's rmse: 1.10359	valid_1's rmse: 1.10352
[3]	training's rmse: 1.10266	valid_1's rmse: 1.1026
[4]	training's rmse: 1.10177	valid_1's rmse: 1.10171
[5]	training's rmse: 1.10107	valid_1's rmse: 1.10102
[6]	training's rmse: 1.10037	valid_1's rmse: 1.10032
[7]	training's rmse: 1.09973	valid_1's rmse: 1.09968
[8]	training's rmse: 1.0992	valid_1's rmse: 1.09917
[9]	training's rmse: 1.09872	valid_1's rmse: 1.09869
[10]	training's rmse: 1.09825	valid_1's rmse: 1.09822
[11]	training's rmse: 1.09788	valid_1's rmse: 1.09785
[12]	training's rmse: 1.09754	valid_1's rmse: 1.09751
[13]	training's rmse: 1.09721	valid_1's rmse: 1.09719
[14]	training's rmse: 1.09694	valid_1's rmse: 1.09692
[15]	training's rmse: 1.09671	valid_1's rmse: 1.09669
[16]	training's rmse: 1.09654	valid_1's rmse: 1.09652
[17]	training's rmse: 1.09626	valid_1's rmse: 1.09625
[18]	training'

[153]	training's rmse: 1.08141	valid_1's rmse: 1.08166
[154]	training's rmse: 1.08133	valid_1's rmse: 1.08158
[155]	training's rmse: 1.08122	valid_1's rmse: 1.08148
[156]	training's rmse: 1.0812	valid_1's rmse: 1.08145
[157]	training's rmse: 1.08116	valid_1's rmse: 1.08142
[158]	training's rmse: 1.08114	valid_1's rmse: 1.08139
[159]	training's rmse: 1.08106	valid_1's rmse: 1.08132
[160]	training's rmse: 1.08103	valid_1's rmse: 1.08129
[161]	training's rmse: 1.08096	valid_1's rmse: 1.08122
[162]	training's rmse: 1.0809	valid_1's rmse: 1.08116
[163]	training's rmse: 1.08082	valid_1's rmse: 1.08108
[164]	training's rmse: 1.08076	valid_1's rmse: 1.08103
[165]	training's rmse: 1.08074	valid_1's rmse: 1.081
[166]	training's rmse: 1.08065	valid_1's rmse: 1.08091
[167]	training's rmse: 1.0806	valid_1's rmse: 1.08086
[168]	training's rmse: 1.0805	valid_1's rmse: 1.08077
[169]	training's rmse: 1.08044	valid_1's rmse: 1.08071
[170]	training's rmse: 1.08036	valid_1's rmse: 1.08062
[171]	training's

[304]	training's rmse: 1.07398	valid_1's rmse: 1.07443
[305]	training's rmse: 1.07394	valid_1's rmse: 1.07439
[306]	training's rmse: 1.07389	valid_1's rmse: 1.07435
[307]	training's rmse: 1.07388	valid_1's rmse: 1.07433
[308]	training's rmse: 1.07385	valid_1's rmse: 1.0743
[309]	training's rmse: 1.07382	valid_1's rmse: 1.07427
[310]	training's rmse: 1.07378	valid_1's rmse: 1.07423
[311]	training's rmse: 1.07372	valid_1's rmse: 1.07418
[312]	training's rmse: 1.07367	valid_1's rmse: 1.07413
[313]	training's rmse: 1.07364	valid_1's rmse: 1.0741
[314]	training's rmse: 1.0736	valid_1's rmse: 1.07406
[315]	training's rmse: 1.07356	valid_1's rmse: 1.07402
[316]	training's rmse: 1.07354	valid_1's rmse: 1.074
[317]	training's rmse: 1.07351	valid_1's rmse: 1.07398
[318]	training's rmse: 1.0735	valid_1's rmse: 1.07396
[319]	training's rmse: 1.07348	valid_1's rmse: 1.07395
[320]	training's rmse: 1.07346	valid_1's rmse: 1.07393
[321]	training's rmse: 1.07344	valid_1's rmse: 1.0739
[322]	training's 

[455]	training's rmse: 1.06939	valid_1's rmse: 1.07003
[456]	training's rmse: 1.06936	valid_1's rmse: 1.07
[457]	training's rmse: 1.06934	valid_1's rmse: 1.06998
[458]	training's rmse: 1.06932	valid_1's rmse: 1.06996
[459]	training's rmse: 1.06926	valid_1's rmse: 1.0699
[460]	training's rmse: 1.06924	valid_1's rmse: 1.06989
[461]	training's rmse: 1.0692	valid_1's rmse: 1.06985
[462]	training's rmse: 1.06917	valid_1's rmse: 1.06982
[463]	training's rmse: 1.06915	valid_1's rmse: 1.06981
[464]	training's rmse: 1.06911	valid_1's rmse: 1.06977
[465]	training's rmse: 1.06909	valid_1's rmse: 1.06975
[466]	training's rmse: 1.06907	valid_1's rmse: 1.06973
[467]	training's rmse: 1.06904	valid_1's rmse: 1.06971
[468]	training's rmse: 1.06901	valid_1's rmse: 1.06968
[469]	training's rmse: 1.069	valid_1's rmse: 1.06967
[470]	training's rmse: 1.06897	valid_1's rmse: 1.06964
[471]	training's rmse: 1.06894	valid_1's rmse: 1.06961
[472]	training's rmse: 1.0689	valid_1's rmse: 1.06958
[473]	training's r

[605]	training's rmse: 1.06557	valid_1's rmse: 1.06641
[606]	training's rmse: 1.06555	valid_1's rmse: 1.0664
[607]	training's rmse: 1.06552	valid_1's rmse: 1.06637
[608]	training's rmse: 1.06549	valid_1's rmse: 1.06634
[609]	training's rmse: 1.06545	valid_1's rmse: 1.0663
[610]	training's rmse: 1.06543	valid_1's rmse: 1.06628
[611]	training's rmse: 1.06542	valid_1's rmse: 1.06627
[612]	training's rmse: 1.06539	valid_1's rmse: 1.06624
[613]	training's rmse: 1.06538	valid_1's rmse: 1.06623
[614]	training's rmse: 1.06537	valid_1's rmse: 1.06622
[615]	training's rmse: 1.06537	valid_1's rmse: 1.06622
[616]	training's rmse: 1.06534	valid_1's rmse: 1.06619
[617]	training's rmse: 1.06532	valid_1's rmse: 1.06617
[618]	training's rmse: 1.06531	valid_1's rmse: 1.06616
[619]	training's rmse: 1.06529	valid_1's rmse: 1.06614
[620]	training's rmse: 1.06527	valid_1's rmse: 1.06613
[621]	training's rmse: 1.06526	valid_1's rmse: 1.06612
[622]	training's rmse: 1.06525	valid_1's rmse: 1.0661
[623]	trainin

[755]	training's rmse: 1.06248	valid_1's rmse: 1.06349
[756]	training's rmse: 1.06245	valid_1's rmse: 1.06346
[757]	training's rmse: 1.06244	valid_1's rmse: 1.06346
[758]	training's rmse: 1.06241	valid_1's rmse: 1.06342
[759]	training's rmse: 1.06239	valid_1's rmse: 1.06341
[760]	training's rmse: 1.06238	valid_1's rmse: 1.06339
[761]	training's rmse: 1.06233	valid_1's rmse: 1.06335
[762]	training's rmse: 1.06231	valid_1's rmse: 1.06333
[763]	training's rmse: 1.0623	valid_1's rmse: 1.06332
[764]	training's rmse: 1.06229	valid_1's rmse: 1.06332
[765]	training's rmse: 1.06228	valid_1's rmse: 1.06331
[766]	training's rmse: 1.06225	valid_1's rmse: 1.06328
[767]	training's rmse: 1.06223	valid_1's rmse: 1.06325
[768]	training's rmse: 1.06222	valid_1's rmse: 1.06325
[769]	training's rmse: 1.0622	valid_1's rmse: 1.06323
[770]	training's rmse: 1.06216	valid_1's rmse: 1.06319
[771]	training's rmse: 1.06213	valid_1's rmse: 1.06316
[772]	training's rmse: 1.06212	valid_1's rmse: 1.06315
[773]	traini

[906]	training's rmse: 1.05979	valid_1's rmse: 1.06101
[907]	training's rmse: 1.05977	valid_1's rmse: 1.06098
[908]	training's rmse: 1.05974	valid_1's rmse: 1.06095
[909]	training's rmse: 1.05971	valid_1's rmse: 1.06093
[910]	training's rmse: 1.0597	valid_1's rmse: 1.06091
[911]	training's rmse: 1.05969	valid_1's rmse: 1.06091
[912]	training's rmse: 1.05969	valid_1's rmse: 1.06091
[913]	training's rmse: 1.05968	valid_1's rmse: 1.0609
[914]	training's rmse: 1.05966	valid_1's rmse: 1.06088
[915]	training's rmse: 1.05964	valid_1's rmse: 1.06087
[916]	training's rmse: 1.05961	valid_1's rmse: 1.06084
[917]	training's rmse: 1.05958	valid_1's rmse: 1.06081
[918]	training's rmse: 1.05956	valid_1's rmse: 1.06079
[919]	training's rmse: 1.05954	valid_1's rmse: 1.06077
[920]	training's rmse: 1.05952	valid_1's rmse: 1.06075
[921]	training's rmse: 1.05949	valid_1's rmse: 1.06071
[922]	training's rmse: 1.05948	valid_1's rmse: 1.06071
[923]	training's rmse: 1.05946	valid_1's rmse: 1.06069
[924]	traini

[1057]	training's rmse: 1.05755	valid_1's rmse: 1.05894
[1058]	training's rmse: 1.05753	valid_1's rmse: 1.05892
[1059]	training's rmse: 1.05752	valid_1's rmse: 1.05891
[1060]	training's rmse: 1.05749	valid_1's rmse: 1.05889
[1061]	training's rmse: 1.05748	valid_1's rmse: 1.05888
[1062]	training's rmse: 1.05748	valid_1's rmse: 1.05887
[1063]	training's rmse: 1.05746	valid_1's rmse: 1.05886
[1064]	training's rmse: 1.05744	valid_1's rmse: 1.05884
[1065]	training's rmse: 1.05742	valid_1's rmse: 1.05882
[1066]	training's rmse: 1.05741	valid_1's rmse: 1.05882
[1067]	training's rmse: 1.0574	valid_1's rmse: 1.05881
[1068]	training's rmse: 1.0574	valid_1's rmse: 1.0588
[1069]	training's rmse: 1.05739	valid_1's rmse: 1.0588
[1070]	training's rmse: 1.05738	valid_1's rmse: 1.05879
[1071]	training's rmse: 1.05736	valid_1's rmse: 1.05877
[1072]	training's rmse: 1.05734	valid_1's rmse: 1.05875
[1073]	training's rmse: 1.05731	valid_1's rmse: 1.05873
[1074]	training's rmse: 1.0573	valid_1's rmse: 1.058

[1205]	training's rmse: 1.05558	valid_1's rmse: 1.05715
[1206]	training's rmse: 1.05556	valid_1's rmse: 1.05713
[1207]	training's rmse: 1.05555	valid_1's rmse: 1.05711
[1208]	training's rmse: 1.05553	valid_1's rmse: 1.0571
[1209]	training's rmse: 1.05553	valid_1's rmse: 1.0571
[1210]	training's rmse: 1.05552	valid_1's rmse: 1.05709
[1211]	training's rmse: 1.05551	valid_1's rmse: 1.05708
[1212]	training's rmse: 1.0555	valid_1's rmse: 1.05708
[1213]	training's rmse: 1.05549	valid_1's rmse: 1.05706
[1214]	training's rmse: 1.05546	valid_1's rmse: 1.05703
[1215]	training's rmse: 1.05544	valid_1's rmse: 1.05702
[1216]	training's rmse: 1.05543	valid_1's rmse: 1.057
[1217]	training's rmse: 1.05541	valid_1's rmse: 1.05698
[1218]	training's rmse: 1.0554	valid_1's rmse: 1.05697
[1219]	training's rmse: 1.05537	valid_1's rmse: 1.05695
[1220]	training's rmse: 1.05535	valid_1's rmse: 1.05693
[1221]	training's rmse: 1.05535	valid_1's rmse: 1.05693
[1222]	training's rmse: 1.05535	valid_1's rmse: 1.0569

[1354]	training's rmse: 1.05372	valid_1's rmse: 1.05546
[1355]	training's rmse: 1.05369	valid_1's rmse: 1.05544
[1356]	training's rmse: 1.05366	valid_1's rmse: 1.05541
[1357]	training's rmse: 1.05365	valid_1's rmse: 1.0554
[1358]	training's rmse: 1.05365	valid_1's rmse: 1.0554
[1359]	training's rmse: 1.05363	valid_1's rmse: 1.05538
[1360]	training's rmse: 1.05362	valid_1's rmse: 1.05537
[1361]	training's rmse: 1.05362	valid_1's rmse: 1.05537
[1362]	training's rmse: 1.05361	valid_1's rmse: 1.05536
[1363]	training's rmse: 1.05359	valid_1's rmse: 1.05535
[1364]	training's rmse: 1.05359	valid_1's rmse: 1.05534
[1365]	training's rmse: 1.05358	valid_1's rmse: 1.05534
[1366]	training's rmse: 1.05357	valid_1's rmse: 1.05532
[1367]	training's rmse: 1.05356	valid_1's rmse: 1.05532
[1368]	training's rmse: 1.05356	valid_1's rmse: 1.05532
[1369]	training's rmse: 1.05355	valid_1's rmse: 1.05531
[1370]	training's rmse: 1.05354	valid_1's rmse: 1.0553
[1371]	training's rmse: 1.05352	valid_1's rmse: 1.0

[1501]	training's rmse: 1.05219	valid_1's rmse: 1.05411
[1502]	training's rmse: 1.05218	valid_1's rmse: 1.0541
[1503]	training's rmse: 1.05216	valid_1's rmse: 1.05408
[1504]	training's rmse: 1.05215	valid_1's rmse: 1.05408
[1505]	training's rmse: 1.05215	valid_1's rmse: 1.05407
[1506]	training's rmse: 1.05214	valid_1's rmse: 1.05406
[1507]	training's rmse: 1.05213	valid_1's rmse: 1.05405
[1508]	training's rmse: 1.05213	valid_1's rmse: 1.05405
[1509]	training's rmse: 1.05212	valid_1's rmse: 1.05404
[1510]	training's rmse: 1.05211	valid_1's rmse: 1.05403
[1511]	training's rmse: 1.0521	valid_1's rmse: 1.05403
[1512]	training's rmse: 1.0521	valid_1's rmse: 1.05402
[1513]	training's rmse: 1.05208	valid_1's rmse: 1.05401
[1514]	training's rmse: 1.05206	valid_1's rmse: 1.05399
[1515]	training's rmse: 1.05206	valid_1's rmse: 1.05399
[1516]	training's rmse: 1.05204	valid_1's rmse: 1.05397
[1517]	training's rmse: 1.05203	valid_1's rmse: 1.05396
[1518]	training's rmse: 1.05202	valid_1's rmse: 1.0

[1650]	training's rmse: 1.05046	valid_1's rmse: 1.05254
[1651]	training's rmse: 1.05045	valid_1's rmse: 1.05254
[1652]	training's rmse: 1.05043	valid_1's rmse: 1.05252
[1653]	training's rmse: 1.0504	valid_1's rmse: 1.05249
[1654]	training's rmse: 1.0504	valid_1's rmse: 1.05249
[1655]	training's rmse: 1.05039	valid_1's rmse: 1.05248
[1656]	training's rmse: 1.05038	valid_1's rmse: 1.05248
[1657]	training's rmse: 1.05037	valid_1's rmse: 1.05247
[1658]	training's rmse: 1.05035	valid_1's rmse: 1.05245
[1659]	training's rmse: 1.05035	valid_1's rmse: 1.05244
[1660]	training's rmse: 1.05034	valid_1's rmse: 1.05244
[1661]	training's rmse: 1.05034	valid_1's rmse: 1.05244
[1662]	training's rmse: 1.05033	valid_1's rmse: 1.05243
[1663]	training's rmse: 1.05031	valid_1's rmse: 1.05241
[1664]	training's rmse: 1.05031	valid_1's rmse: 1.05241
[1665]	training's rmse: 1.05028	valid_1's rmse: 1.05239
[1666]	training's rmse: 1.05027	valid_1's rmse: 1.05238
[1667]	training's rmse: 1.05026	valid_1's rmse: 1.

[1797]	training's rmse: 1.04883	valid_1's rmse: 1.0511
[1798]	training's rmse: 1.04883	valid_1's rmse: 1.0511
[1799]	training's rmse: 1.04882	valid_1's rmse: 1.05109
[1800]	training's rmse: 1.04881	valid_1's rmse: 1.05109
[1801]	training's rmse: 1.0488	valid_1's rmse: 1.05108
[1802]	training's rmse: 1.04878	valid_1's rmse: 1.05106
[1803]	training's rmse: 1.04877	valid_1's rmse: 1.05105
[1804]	training's rmse: 1.04876	valid_1's rmse: 1.05104
[1805]	training's rmse: 1.04875	valid_1's rmse: 1.05103
[1806]	training's rmse: 1.04875	valid_1's rmse: 1.05103
[1807]	training's rmse: 1.04873	valid_1's rmse: 1.05102
[1808]	training's rmse: 1.04873	valid_1's rmse: 1.05101
[1809]	training's rmse: 1.04872	valid_1's rmse: 1.051
[1810]	training's rmse: 1.0487	valid_1's rmse: 1.05099
[1811]	training's rmse: 1.0487	valid_1's rmse: 1.05099
[1812]	training's rmse: 1.04869	valid_1's rmse: 1.05098
[1813]	training's rmse: 1.04868	valid_1's rmse: 1.05097
[1814]	training's rmse: 1.04867	valid_1's rmse: 1.05096

[1944]	training's rmse: 1.04747	valid_1's rmse: 1.04991
[1945]	training's rmse: 1.04746	valid_1's rmse: 1.0499
[1946]	training's rmse: 1.04745	valid_1's rmse: 1.04989
[1947]	training's rmse: 1.04744	valid_1's rmse: 1.04989
[1948]	training's rmse: 1.04743	valid_1's rmse: 1.04988
[1949]	training's rmse: 1.04741	valid_1's rmse: 1.04986
[1950]	training's rmse: 1.04741	valid_1's rmse: 1.04986
[1951]	training's rmse: 1.04739	valid_1's rmse: 1.04984
[1952]	training's rmse: 1.04737	valid_1's rmse: 1.04983
[1953]	training's rmse: 1.04736	valid_1's rmse: 1.04982
[1954]	training's rmse: 1.04736	valid_1's rmse: 1.04982
[1955]	training's rmse: 1.04735	valid_1's rmse: 1.0498
[1956]	training's rmse: 1.04733	valid_1's rmse: 1.04979
[1957]	training's rmse: 1.04731	valid_1's rmse: 1.04977
[1958]	training's rmse: 1.04731	valid_1's rmse: 1.04977
[1959]	training's rmse: 1.0473	valid_1's rmse: 1.04976
[1960]	training's rmse: 1.04727	valid_1's rmse: 1.04973
[1961]	training's rmse: 1.04725	valid_1's rmse: 1.0