In [1]:
import pandas as pd
import numpy as np
import csv
import datetime 
import time

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from src import settings

retail_rocket_root_path = settings.DATA_ROOT_PATH + '/site_data/retail_rocket/' 
recobell_root_path = settings.DATA_ROOT_PATH + "/site_data/recobell/"
ml_1m_root_path = settings.DATA_ROOT_PATH + "/site_data/ml-1m/"

In [2]:
ratings = pd.read_csv(ml_1m_root_path + 'raw_data/ratings.dat', delimiter='::', header=None, engine='python', names=['uid', 'itemid', 'rating', 'server_time'])
ratings.head()
# lay 5 gia tri tuong tac dau tien, moi tuong tac lay: uid, itemid, rating, server_time.

Unnamed: 0,uid,itemid,rating,server_time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [16]:
print(ratings.count())

uid            1000209
itemid         1000209
rating         1000209
server_time    1000209
dtype: int64


In [17]:
item_df = pd.read_csv(ml_1m_root_path + 'raw_data/movies.dat', delimiter='::', header=None, engine='python', names=['itemid', 'title', 'genres'])
item_df.head()

Unnamed: 0,itemid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
print(item_df.count())
print "\n"
new_list_item= item_df.loc[:,'itemid'].tolist()
print "Type data: ", type(new_list_item)
print "Length list item:", len(new_list_item)
print "Top 5 item:", new_list_item[:5]

itemid    3883
title     3883
genres    3883
dtype: int64


Type data:  <type 'list'>
Length list item: 3883
Top 5 item: [1, 2, 3, 4, 5]


In [19]:
print(ratings.count())

uid            1000209
itemid         1000209
rating         1000209
server_time    1000209
dtype: int64


In [20]:
# loai bo cac gia tri ratings trong bo du lieu ratings
# doi voi cac item_id co rating ma khong co trong danh sach cac item (dl sai).
new_rating = ratings[ratings.itemid.isin(item_df.loc[:, 'itemid'].tolist())]
print(new_rating.count())

uid            1000209
itemid         1000209
rating         1000209
server_time    1000209
dtype: int64


In [21]:
with open(ml_1m_root_path + 'i2index.txt') as f:
    for line in f:
        active_item_dict = {int(line.split(',')[0]): int(line.split(',')[1])}
        print (active_item_dict)

{1193: 0}
{661: 1}
{914: 2}
{3408: 3}
{2355: 4}
{1197: 5}
{1287: 6}
{2804: 7}
{594: 8}
{919: 9}
{595: 10}
{938: 11}
{2398: 12}
{2918: 13}
{1035: 14}
{2791: 15}
{2687: 16}
{2018: 17}
{3105: 18}
{2797: 19}
{2321: 20}
{720: 21}
{1270: 22}
{527: 23}
{2340: 24}
{48: 25}
{1097: 26}
{1721: 27}
{1545: 28}
{745: 29}
{2294: 30}
{3186: 31}
{1566: 32}
{588: 33}
{1907: 34}
{783: 35}
{1836: 36}
{1022: 37}
{2762: 38}
{150: 39}
{1: 40}
{1961: 41}
{1962: 42}
{2692: 43}
{260: 44}
{1028: 45}
{1029: 46}
{1207: 47}
{2028: 48}
{531: 49}
{3114: 50}
{608: 51}
{1246: 52}
{1357: 53}
{3068: 54}
{1537: 55}
{647: 56}
{2194: 57}
{648: 58}
{2268: 59}
{2628: 60}
{1103: 61}
{2916: 62}
{3468: 63}
{1210: 64}
{1792: 65}
{1687: 66}
{1213: 67}
{3578: 68}
{2881: 69}
{3030: 70}
{1217: 71}
{434: 72}
{2126: 73}
{3107: 74}
{3108: 75}
{3035: 76}
{1253: 77}
{1610: 78}
{292: 79}
{2236: 80}
{3071: 81}
{902: 82}
{368: 83}
{1259: 84}
{3147: 85}
{1544: 86}
{1293: 87}
{1188: 88}
{3255: 89}
{3256: 90}
{3257: 91}
{110: 92}
{2278: 93}
{24

{279: 2113}
{437: 2114}
{3724: 2115}
{3729: 2116}
{1050: 2117}
{3738: 2118}
{2939: 2119}
{451: 2120}
{2940: 2121}
{1209: 2122}
{78: 2123}
{79: 2124}
{82: 2125}
{1082: 2126}
{3841: 2127}
{1161: 2128}
{630: 2129}
{1099: 2130}
{3929: 2131}
{640: 2132}
{3007: 2133}
{3115: 2134}
{2505: 2135}
{1707: 2136}
{166: 2137}
{2596: 2138}
{2672: 2139}
{2678: 2140}
{1416: 2141}
{2913: 2142}
{9: 2143}
{2803: 2144}
{1919: 2145}
{637: 2146}
{973: 2147}
{1939: 2148}
{2929: 2149}
{3112: 2150}
{3069: 2151}
{1926: 2152}
{2690: 2153}
{2434: 2154}
{2439: 2155}
{1904: 2156}
{40: 2157}
{2995: 2158}
{43: 2159}
{2979: 2160}
{1187: 2161}
{3124: 2162}
{8: 2163}
{692: 2164}
{2170: 2165}
{2196: 2166}
{2448: 2167}
{990: 2168}
{2625: 2169}
{3444: 2170}
{240: 2171}
{1858: 2172}
{1004: 2173}
{2879: 2174}
{464: 2175}
{3767: 2176}
{3768: 2177}
{704: 2178}
{869: 2179}
{2357: 2180}
{577: 2181}
{801: 2182}
{2092: 2183}
{3054: 2184}
{1460: 2185}
{1551: 2186}
{2354: 2187}
{1489: 2188}
{885: 2189}
{1495: 2190}
{1582: 2191}
{3400:

In [22]:
with open(ml_1m_root_path + 'i2index.txt') as f:
    active_item_dict = {int(line.split(',')[0]): int(line.split(',')[1]) for line in f}

print(len(active_item_dict))
print (active_item_dict)
# item_df = item_df[item_df.itemid.isin(active_item_list)]
# item_df.count()

3706
{1: 40, 2: 573, 3: 1333, 4: 450, 5: 1334, 6: 255, 7: 576, 8: 2163, 9: 2143, 10: 843, 11: 1256, 12: 2262, 13: 1683, 14: 490, 15: 1850, 16: 328, 17: 417, 18: 1958, 19: 1851, 20: 2427, 21: 145, 22: 1967, 23: 1883, 24: 359, 25: 504, 26: 1065, 27: 2830, 28: 1522, 29: 364, 30: 976, 31: 2743, 32: 376, 33: 3364, 34: 381, 35: 1773, 36: 383, 37: 3178, 38: 1375, 39: 221, 40: 2157, 41: 239, 42: 441, 43: 2159, 44: 1078, 45: 1276, 46: 1686, 47: 242, 48: 25, 49: 3083, 50: 259, 52: 260, 53: 3418, 54: 1867, 55: 1980, 56: 3168, 57: 2769, 58: 453, 59: 3322, 60: 852, 61: 3191, 62: 704, 63: 2265, 64: 1404, 65: 1219, 66: 1719, 67: 2820, 68: 2600, 69: 2235, 70: 909, 71: 2319, 72: 2225, 73: 478, 74: 1414, 75: 3075, 76: 996, 77: 2929, 78: 2123, 79: 2124, 80: 2821, 81: 1224, 82: 2125, 83: 2632, 84: 2226, 85: 821, 86: 1097, 87: 2201, 88: 822, 89: 1878, 90: 3420, 92: 1688, 93: 1890, 94: 1434, 95: 130, 96: 3102, 97: 2493, 98: 2656, 99: 2730, 100: 1926, 101: 2417, 102: 2878, 103: 3188, 104: 179, 105: 481, 106:

In [9]:
genre_dict = {'Action': 0, 
              'Adventure': 1,
'Animation':2
, '''Children's''': 3
, 'Comedy': 4
, 'Crime': 5
, 'Documentary': 6
, 'Drama': 7
, 'Fantasy': 8
, 'Film-Noir': 9
, 'Horror': 10
, 'Musical': 11
, 'Mystery': 12
, 'Romance': 13
, 'Sci-Fi': 14
, 'Thriller': 15
, 'War': 16
, 'Western': 17}
genre_dict
# list category for list movie.

{'Action': 0,
 'Adventure': 1,
 'Animation': 2,
 "Children's": 3,
 'Comedy': 4,
 'Crime': 5,
 'Documentary': 6,
 'Drama': 7,
 'Fantasy': 8,
 'Film-Noir': 9,
 'Horror': 10,
 'Musical': 11,
 'Mystery': 12,
 'Romance': 13,
 'Sci-Fi': 14,
 'Thriller': 15,
 'War': 16,
 'Western': 17}

In [23]:
item_df.head()

Unnamed: 0,itemid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# apply len tat cac cac phan tu cua item_df voi ham vo danh
y = item_df.apply(lambda x: [active_item_dict[x[0]], x[2]], axis=1, result_type='expand')
y.head()
# doi voi moi item trong item_df => thay id moi, categories
# 40,Animation|Children's|Comedy
# 573,Adventure|Children's|Fantasy
# 1333,Comedy|Romance
# 450,Comedy|Drama

KeyError: (51, u'occurred at index 50')

In [15]:
# write data frame into csv file
y.to_csv(ml_1m_root_path + 'i2pcat.txt', header=False, index=False)

# New task preprocessing for retail rocket dataset.

In [34]:
# events_df = pd.read_csv(retail_rocket_root_path + 'raw_data/events.csv')
# category_tree_df = pd.read_csv(retail_rocket_root_path + 'raw_data/category_tree.csv')

# item_properties file 
# 417 053 unique items
# every row in the file has corresponding timestamp
# item properties file contain timestamp column because all of them are time dependent
# since properties may change over time, e.g. price, category, etc

# value of categoryid properties contain item category identifier
# value of available properties contains availability of the item ( 1: available, 0: ow)
# All numerical values were marked with "n" char at the beginning, and have 3 digits precision after decimal point


item_properties_1_df = pd.read_csv(retail_rocket_root_path + 'raw_data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv(retail_rocket_root_path + 'raw_data/item_properties_part2.csv')
# timestamp,itemid,property,value

# 1435460400000,460429,categoryid,1338
# 1441508400000,206783,888,1116713 960601 n277.200
# 1439089200000,395014,400,n552.000 639502 n720.000 424566
# 1431226800000,59481,790,n15360.000
# 1431831600000,156781,917,828513
# 1436065200000,285026,available,0
# 1434250800000,89534,213,1121373
# 1431831600000,264312,6,319724
# 1433646000000,229370,202,1330310
# 1434250800000,98113,451,1141052 n48.000
# 1439089200000,450113,888,1038400 45956 n504.000

In [35]:
item_properties_1_df[:11]

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
5,1436065200000,285026,available,0
6,1434250800000,89534,213,1121373
7,1431831600000,264312,6,319724
8,1433646000000,229370,202,1330310
9,1434250800000,98113,451,1141052 n48.000


In [17]:
item_properties_2_df[:11]

Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062
5,1433041200000,286407,202,820407
6,1438484400000,256368,888,437265 1296497 n24.000 229949 651738 285933
7,1437879600000,307534,888,150169 212349 1095303 824508 1257235 153900
8,1439089200000,102767,888,5135 790941 1055803 221748 122132 n12.000 1135...
9,1431831600000,215180,71,1096621


In [36]:
events_df = pd.read_csv(retail_rocket_root_path + 'raw_data/events.csv')
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [37]:
# loc function: access a group of row and columns by labels.

# item_properties_1_df.property: dua ra item_id va properties cua item_properties_1_df
# 0           categoryid
# 1                  888
# 2                  400
# 3                  790
# 4                  917

# item_properties_1_df.property == 'categoryid'
#  0            True
#  1           False
#  2           False
#  3           False
#  4           False
#  5           False
#  6           False

# item_properties_1_df.loc[item_properties_1_df.property == 'categoryid', ['timestamp', 'itemid', 'value']]

# lấy ở item_properties_1_df voi property = categoryid voi cac gia tri tren cac cot timestam, itemid va value

# giam gia tri tu 10999999 xuong con 426305 ban ghi

# Value of the "categoryid" property contains item category identifier.

item_cat_1 = item_properties_1_df.loc[item_properties_1_df.property == 'categoryid', ['timestamp', 'itemid', 'value']]
item_cat_1.head()

Unnamed: 0,timestamp,itemid,value
0,1435460400000,460429,1338
140,1432436400000,281245,1277
151,1435460400000,35575,1059
189,1437274800000,8313,1147
197,1437879600000,55102,47


In [39]:
item_cat_2 = item_properties_2_df.loc[item_properties_2_df.property == 'categoryid', ['timestamp', 'itemid', 'value']]
item = item_cat_1.append(item_cat_2)
# y = item.drop_duplicates(['itemid', 'value']).groupby(['itemid', 'value']).count()
# y.head()
# # temp = item.loc[item.itemid == 25]
# # temp.sort_values(by=['timestamp'])
# # y = item.groupby(['itemid', 'value']).count()
# y.loc[y.timestamp == 1]
print "Record retrive from 1:", len(item_cat_1)
print "Record retrive from 1:", len(item_cat_2)
print "Total records:", len(item)

Record retrive from 1: 426305
Record retrive from 1: 361909
Total records: 788214


In [27]:
item[:10]

Unnamed: 0,timestamp,itemid,value
0,1435460400000,460429,1338
140,1432436400000,281245,1277
151,1435460400000,35575,1059
189,1437274800000,8313,1147
197,1437879600000,55102,47
213,1433041200000,397079,619
237,1436670000000,265036,1228
254,1437879600000,124459,1277
310,1437879600000,350508,546
325,1439089200000,221365,1226


In [40]:
# y.loc[:, ['itemid', 'value']] # type: pd.DataFrame

# xoa cac ban ghi trung nhau tren cac cot item_id va value, va loc lai lay cac gia tri tren 2 cot itemid va value.
z = item.drop_duplicates(['itemid', 'value']).loc[:, ['itemid', 'value']]
print z.head()
print "\n"
print "Length of data after remove duplicate:", len(z)
# y.to_dict()
# z.to_csv(retail_rocket_root_path + 'temp.txt', index=False, header=False)

     itemid value
0    460429  1338
140  281245  1277
151   35575  1059
189    8313  1147
197   55102    47


Length of data after remove duplicate: 442432


# DEMO DROP_DUPLICATE.

In [37]:
raw_data = {'first_name': ['Jason', 'Jason', 'Jason','Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Miller1', 'Miller2','Ali', 'Milner', 'Cooze'], 
        'age': [42, 42, 1111111, 36, 24, 73], 
        'preTestScore': [4, 4, 4, 31, 2, 3],
        'postTestScore': [25, 25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
print (df)

  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
1      Jason   Miller1       42             4             25
2      Jason   Miller2  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70


In [38]:
df.drop_duplicates(['first_name'])

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25
3,Tina,Ali,36,31,57
4,Jake,Milner,24,2,62
5,Amy,Cooze,73,3,70


In [39]:
df.drop_duplicates(['first_name','last_name'], keep='last') # xoa ban ghi giong nhau o ca first va last name.

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25
1,Jason,Miller1,42,4,25
2,Jason,Miller2,1111111,4,25
3,Tina,Ali,36,31,57
4,Jake,Milner,24,2,62
5,Amy,Cooze,73,3,70


# END DEMO.

In [41]:
z.loc[z.itemid == 124]

Unnamed: 0,itemid,value


In [41]:
# convert i2index.txt into dictionary
with open(retail_rocket_root_path + 'i2index.txt') as f: 
    active_item_dict = {int(line.split(',')[0]): int(line.split(',')[1]) for line in f}

# get list key from this dict.
active_item_list = active_item_dict.keys()

In [45]:
# remove cac phan tu trong z khong nam trong active item list (cac key lay tu file i2index.txt)
s = z[z.itemid.isin(active_item_list)]
# khong quan tam den value.
print 's head'
print s.head()

y = s.loc[:, 'itemid'].drop_duplicates() # xoa item_id trung nhau va loc tren cot itemid
print "y data type:",type(y)
print (y.head())
print len(y)


s head
     itemid value
189    8313  1147
408   96660   498
504  244437   438
804  313481  1613
836  366108   498
y data type: <class 'pandas.core.series.Series'>
189      8313
408     96660
504    244437
804    313481
836    366108
Name: itemid, dtype: int64
83274


In [43]:
# active item dict
print active_item_dict[75509]
print 'max key:', (max([int(s) for s in active_item_dict.keys()]))
print 'max value:',(max([int(s) for s in active_item_dict.values()]))

44419
max key: 466864
max value: 83273


In [46]:
# apply len tat ca cac phan tu cua s voi ham vo danh lambda.
# s.head()
#      itemid value
# 189    8313  1147
# 408   96660   498
# 504  244437   438
# 804  313481  1613
# 836  366108   498

# active item dict
# 393218: 27821,
#  393219: 30596,
#  393220: 2768,
#  152918: 36242,
#  6: 5054,

# thay the cac phan tu tung hang trong s tuong ung boi active_item_dict[x[0]], x[1]
# t moi thu duoc bao gom key cua active_item_dict va value cua s tuong ung.
t = s.apply(lambda x: [active_item_dict[x[0]], x[1]], axis=1, result_type='expand')
t.head()
# s.to_csv(retail_rocket_root_path + 'i2pcat.txt', index=False, header=False)

Unnamed: 0,0,1
189,75509,1147
408,42912,498
504,13362,438
804,33257,1613
836,60227,498


In [18]:
t.to_csv(retail_rocket_root_path + 'i2pcat.txt', index=False, header=False)
# i2pcat: item_id va categories tuong ung cua item do.
# voi item_id la id cu cua item khi chua mapping

In [33]:
print y.head() # y bao gom id cua ban ghi va tuong ung id item.
x = y.to_list() # lay cac item_id trong y
print len(x)
# print x[:5]

189      8313
408     96660
504    244437
804    313481
836    366108
Name: itemid, dtype: int64
83274


In [37]:
# get even_df tu trong danh sach cac events_df ma itemid do nam trong x
new_event_df = events_df[events_df.itemid.isin(x)]
# print(new_event_df.count())
# print(events_df.count())
new_event_df.to_csv(retail_rocket_root_path + 'raw_data/new_events.csv', index=False)
new_event_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
4,1433221337106,951259,view,367447,
6,1433221923240,810725,view,443030,
7,1433223291897,794181,view,439202,


In [48]:
category_tree_df = pd.read_csv(retail_rocket_root_path + 'raw_data/category_tree.csv')
category_tree_df.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [40]:
category_tree_df['categoryid']

0       1016
1        809
2        570
3       1691
4        536
5        231
6        542
7       1146
8       1140
9       1479
10        83
11       688
12       257
13      1640
14       963
15       412
16       948
17       934
18       148
19        12
20      1459
21      1039
22       877
23        28
24       912
25       600
26       177
27       344
28       419
29       192
        ... 
1639     628
1640    1020
1641    1248
1642    1500
1643    1317
1644     857
1645     551
1646    1343
1647     113
1648    1205
1649    1542
1650    1698
1651     760
1652     466
1653      72
1654    1095
1655     959
1656    1308
1657     679
1658     662
1659     460
1660     112
1661    1354
1662     486
1663     456
1664      49
1665    1112
1666    1336
1667     689
1668     761
Name: categoryid, Length: 1669, dtype: int64

In [49]:
# lay ra cac phan tu trong category_tree_df ma parent la na. ????
category_tree_df.loc[category_tree_df.parentid.isna()]

Unnamed: 0,categoryid,parentid
5,231,
589,791,
861,1490,
939,431,
1111,755,
1142,378,
1208,1579,
1211,1394,
1322,659,
1329,1057,


# Below is recobell data preprocessing

# After that it change to retail_rocket data

In [4]:
print (ml_1m_root_path)
explicit_recobell_df = pd.read_csv(ml_1m_root_path + '_explicit.clean.txt', header=None, names=['uid', 'itemid', 'time', 'rating'])
print(explicit_recobell_df.head())
# active_item_df = pd.read_csv(recobell_root_path + 'item_repr.txt', header=None, names=['itemid', 'repr'], delimiter=',', quotechar='|', skiprows=1)
# active_item_df.head()

/home/tucng/Desktop/RS/project/src/site_data/ml-1m/
   uid  itemid       time  rating
0    1    1193  978300760       5
1    1    3408  978300275       4
2    1    2355  978824291       5
3    1    1287  978302039       5
4    1    2804  978300719       5


In [5]:
# load list key for i2index
with open(ml_1m_root_path + 'i2index.txt') as f:
    active_item_dict = {int(line.split(',')[0]): int(line.split(',')[1]) for line in f}
active_item_list = active_item_dict.keys()
print(len(active_item_list))

3706


In [6]:
# list key index from u2index.
with open(ml_1m_root_path + 'u2index.txt') as f:
    active_user_dict = {int(line.split(',')[0]): int(line.split(',')[1]) for line in f}
active_user_list = active_user_dict.keys()
print(len(active_user_list))

6040


In [7]:
# loai bo tuong tac ma user khong co trong active_user_list
explicit_recobell_df = explicit_recobell_df[explicit_recobell_df.uid.isin(active_user_list)]

# loai bo tuong tac ma item khong co trong active item list
explicit_recobell_df = explicit_recobell_df[explicit_recobell_df.itemid.isin(active_item_list)]

print 'explicit recobell df before:'
print (explicit_recobell_df.head())

# thay the rating ban dau boi user_id, item_id moi trong u2index va i2index.
explicit_recobell_df = explicit_recobell_df.apply(lambda x: [active_user_dict[x[0]], active_item_dict[x[1]]], axis=1, result_type='expand')
print(explicit_recobell_df.head())

explicit recobell df before:
   uid  itemid       time  rating
0    1    1193  978300760       5
1    1    3408  978300275       4
2    1    2355  978824291       5
3    1    1287  978302039       5
4    1    2804  978300719       5
   0  1
0  0  0
1  0  3
2  0  4
3  0  6
4  0  7


# user item explicit using for trainning 

In [11]:
explicit_recobell_df.to_csv(ml_1m_root_path + 'ui_ex.txt', index=False, header=False)

# generate item representation.

In [9]:
# using item_repr.txt was made before to generate vector for this item.
active_item_df = pd.read_csv(recobell_root_path + 'item_repr.txt', header=None, names=['itemid', 'repr'], delimiter=',', quotechar='|', skiprows=1)
print(active_item_df.head())

   itemid                                               repr
0    3506  [[17, 0.25], [80, 0.25], [219, 0.25], [464, 0....
1    2293  [[16, 0.25], [46, 0.25], [335, 0.25], [1892, 0...
2    6555  [[9, 0.25], [83, 0.25], [153, 0.25], [1854, 0....
3    8448  [[6, 0.25], [82, 0.25], [249, 0.25], [832, 0.25]]
4    1818  [[1, 0.25], [38, 0.25], [286, 0.25], [841, 0.25]]
/home/tucng/Desktop/RS/project/src/site_data/recobell/


In [15]:
# make a dictionary for each item 
# each of element is a key of item and value of categories
item_repr_dict = {}
for _, row in active_item_df.iterrows():
    item_repr_dict[row[0]] = row[1]

print(item_repr_dict[0])
print(item_repr_dict[3506])

[[14, 0.25], [73, 0.25], [426, 0.25], [1620, 0.25]]
[[17, 0.25], [80, 0.25], [219, 0.25], [464, 0.25]]


In [17]:
explicit_recobell_df['num'] = 1

In [19]:
explicit_recobell_df.columns = ['uid', 'itemid', 'num']
explicit_recobell_df.head()

Unnamed: 0,uid,itemid,num
0,0,0,1
1,0,3,1
2,0,4,1
3,0,6,1
4,0,7,1


In [22]:
import json

def json_string_to_dense_vector(string_vector, dimensions):
    """
    Chuyen vector thua dang string: [[9, 0.010176822], [118, 0.010578092], [264, 0.020403702]]
    ve vector thuong K chieu cua numpy
    """
    K = dimensions

    vector = np.zeros(K, dtype=np.float32)
    if string_vector is None or len(string_vector) < 1:
        return vector

    x = json.loads(string_vector)
    
    for element in x:
        vector[element[0]] = element[1]
    return vector

In [23]:
def dense_vector_to_list_sparse_vector(vector):
    """
    Chuyen vector np ve vector thua dang string: [[9, 0.010176822], [118, 0.010578092], [264, 0.020403702]]
    """
    K = len(vector)
    threshold = 1e-6

    list_temp = []
    for i in range(0, K):
        if vector[i] > threshold:
            list_temp.append([i, float(vector[i])])
    return list_temp

In [24]:
def list_sparse_vector_to_json_string(sv):
    # type: (list) -> str
    """Convert vector in the form list of (int, float) to string

    Parameters
    ----------
    sv : list of (int, float)
        sparse_vector to be converted

    Examples
    ---------
    >>> sparse_vector = [(1, 0.123232), (2, 5.34234234)]
    >>> list_sparse_vector_to_json_string(sparse_vector)
    '[[1, 0.123232], [2, 5.34234234]]'
    """

    sum = 0
    list_sparse = []

    for item in sv:
        sum += item[1]

    for item in sv:
        list_sparse.append([int(item[0]), float(item[1] / sum)])

    return json.dumps(list_sparse)


In [None]:
def add_avg()

In [27]:
explicit_recobell_df.head()

Unnamed: 0,uid,itemid,num
0,0,0,1
1,0,3,1
2,0,4,1
3,0,6,1
4,0,7,1


In [25]:
# apply into each row of explicit_recobell_df
t = explicit_recobell_df.apply(lambda x: [x[0], json_string_to_dense_vector([x[1]], 1939), x[2]], axis=1, result_type='expand').groupby(by=['0'])
print(t.head())

TypeError: ('expected string or buffer', u'occurred at index 0')

In [22]:
# item_recobell_df = pd.read_csv(recobell_root_path + 'raw_data/site_product.csv000', header=None, names=['itemid', 'price', 'cat1', 'cat2', 'cat3', 'cat4', 'brandid'])
# view_recobell_df = pd.read_csv(recobell_root_path + 'raw_data/tiny_site_view_log.csv000', header=None, names=['server_time', 'device', 'session_id', 'uid', 'itemid'])
order_recobell_df = pd.read_csv(recobell_root_path + 'raw_data/tiny_site_order_log.csv000', header=None, names=['server_time', 'device', 'session_id', 'uid', 'itemid', 'order_id', 'quantity'])
view_recobell_df.head()
# preprocessing with recobell datasets

Unnamed: 0,server_time,device,session_id,uid,itemid
0,2016-08-13 12:38:47.512,MA,bgF9eb8ynC,b15f9e5,d9886b1
1,2016-08-14 13:41:34.30,MA,yzbfOf5b4e,8007201,e295b87
2,2016-08-14 13:37:57.682,MA,yzbfOf5b4e,8007201,cedc044
3,2016-08-11 19:04:41.939,MA,YBpaGQxzg3,99055ba,4f3ecec
4,2016-08-11 18:57:11.63,MA,YBpaGQxzg3,99055ba,ee3f827


In [3]:
item_recobell_df.nunique()

itemid     422516
price       24607
cat1           18
cat2           80
cat3          342
cat4         1499
brandid     10584
dtype: int64

In [18]:
active_item_df = pd.read_csv(recobell_root_path + 'item_repr.txt', header=None, names=['itemid', 'repr'], delimiter=',', quotechar='|')
active_item_df.head()

Unnamed: 0,itemid,repr
0,5919d61,"[[17, 0.25], [80, 0.25], [219, 0.25], [464, 0...."
1,1073c34,"[[16, 0.25], [46, 0.25], [335, 0.25], [1892, 0..."
2,559fc56,"[[9, 0.25], [83, 0.25], [153, 0.25], [1854, 0...."
3,67da1f9,"[[6, 0.25], [82, 0.25], [249, 0.25], [832, 0.25]]"
4,b5eaeaa,"[[1, 0.25], [38, 0.25], [286, 0.25], [841, 0.25]]"


In [33]:
z = active_item_df[active_item_df.itemid.isin(active_item_list)]
z = z.drop_duplicates(['itemid'])
z.count()

itemid    118293
repr      118293
dtype: int64

In [34]:
t = z.apply(lambda x: [active_item_dict[x[0]], x[1]], axis=1, result_type='expand')
t.head()

Unnamed: 0,0,1
0,3506,"[[17, 0.25], [80, 0.25], [219, 0.25], [464, 0...."
1,2293,"[[16, 0.25], [46, 0.25], [335, 0.25], [1892, 0..."
2,6555,"[[9, 0.25], [83, 0.25], [153, 0.25], [1854, 0...."
3,8448,"[[6, 0.25], [82, 0.25], [249, 0.25], [832, 0.25]]"
4,1818,"[[1, 0.25], [38, 0.25], [286, 0.25], [841, 0.25]]"


In [37]:
import csv
t.to_csv(recobell_root_path + 'item_repr.txt', index=False, header=False, quotechar='|', quoting=csv.QUOTE_ALL)

In [28]:
y = active_item_df.loc[:, 'itemid'].drop_duplicates()
y.count()

422516

In [29]:
x = y.tolist()
print(len(x))

422516


In [30]:
new_view_df = view_recobell_df[view_recobell_df.itemid.isin(x)]
new_order_df = order_recobell_df[order_recobell_df.itemid.isin(x)]
new_view_df.count()

server_time    4521505
device         4521505
session_id     4521505
uid            4521505
itemid         4521505
dtype: int64

In [31]:
new_view_df.to_csv(recobell_root_path + 'raw_data/new_tiny_site_view_log.csv000', index=False, header=False)
new_order_df.to_csv(recobell_root_path + 'raw_data/new_tiny_site_order_log.csv000', index=False, header=False)

In [27]:
z.head()

Unnamed: 0,itemid,repr
0,5919d61,"[[17, 0.25], [80, 0.25], [219, 0.25], [464, 0...."
1,1073c34,"[[16, 0.25], [46, 0.25], [335, 0.25], [1892, 0..."
2,559fc56,"[[9, 0.25], [83, 0.25], [153, 0.25], [1854, 0...."
3,67da1f9,"[[6, 0.25], [82, 0.25], [249, 0.25], [832, 0.25]]"
4,b5eaeaa,"[[1, 0.25], [38, 0.25], [286, 0.25], [841, 0.25]]"


In [None]:
z.to_csv(recobell_root_path + 'item_repr.txt', index=False, header=False)

In [61]:
a =  item_recobell_df.loc[:, ['cat2', 'cat3']]
x =  item_recobell_df.loc[:, ['cat2']]

In [63]:
y = a.drop_duplicates().groupby(['cat3']).count()
y.loc[y.cat2 > 1]

Unnamed: 0_level_0,cat2
cat3,Unnamed: 1_level_1
0ea22c8,2
142a302,2
23bebb0,2
3919b77,2
44650a9,7
512f480,2
58961c3,2
5b889c3,2
7efeb2d,3
9ea0ca2,2


In [59]:
item_recobell_df[item_recobell_df.cat2 == 'bd0b0a5'].drop_duplicates(['cat1', 'cat2'])

Unnamed: 0,itemid,price,cat1,cat2,cat3,cat4,brandid
9,eb32384,67410,d9d0ba9,bd0b0a5,f1c99bd,af4f9f8,7a9f4fc
114195,257677b,39000,66dd73d,bd0b0a5,58961c3,a457db3,fd23080


In [38]:
b = set(item_recobell_df.cat1) | set(item_recobell_df.cat2) | set(item_recobell_df.cat3) | set(item_recobell_df.cat4)

In [39]:
len(b)

1838

In [10]:
c = list(set(item_recobell_df.cat1))

In [1]:
len(c)
c

NameError: name 'c' is not defined

# Pre-process for movielens data

In [3]:
ratings.head()

Unnamed: 0,uid,itemid,rating,server_time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
count_rating = ratings.groupby(['uid'])[['rating']].count()
count_rating[count_rating['rating'] < 20]

Unnamed: 0_level_0,rating
uid,Unnamed: 1_level_1


In [5]:
mean_rating = ratings.groupby(['uid']).agg({'rating': 'mean'})
mean_rating['uid'] = mean_rating.index
mean_rating[mean_rating['rating'] < 2]

# ratings[ratings['uid'] == 2744]

Unnamed: 0_level_0,rating,uid
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
2744,1.304348,2744
3598,1.015385,3598
4349,1.962963,4349
4486,1.058824,4486
4539,1.815126,4539
5334,1.927273,5334
5850,1.844828,5850


In [7]:
count_rating['rating'].max()

2314

In [8]:
count_rating['rating'].min()

20

In [48]:
mean_rating
with open(ml_1m_root_path + 'u2index.txt') as f:
    active_user_dict = {int(line.split(',')[0]): int(line.split(',')[1]) for line in f}
active_user_list = active_user_dict.keys()
print(len(active_user_list))
new_mean_rating = mean_rating.apply(lambda x: pd.Series([active_user_dict[x['uid']], x['rating']], index=['uid', 'rating']), axis=1)

6040


In [55]:
new_mean_rating['uid'] = new_mean_rating['uid'].apply(lambda x: int(x))

In [6]:
mean_rating.to_csv(ml_1m_root_path + 'user_mean_rating.txt', header=False, index=False, quotechar='|', quoting=csv.QUOTE_ALL)