In [27]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

from surprise import NormalPredictor
from surprise.model_selection import cross_validate
import pandas as pd


In [None]:

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

## Start fitting the data. 

---

The data need dataframe with three columns "user_id","item_id","score" (which will be the click count)

In [32]:
train, test = pd.read_csv("./clean_data/train.csv") ,pd.read_csv("./clean_data/test.csv")

In [33]:
userProjectCount = train.groupby(["userCode","project_id"]).size().reset_index(name='counts')
userProjectCount.head()

Unnamed: 0,userCode,project_id,counts
0,00005aba-5ebc-0821-f5a9-bacca40be125,5342,1
1,0000bae7-6233-d7cc-2a6d-48aa70fe8ad4,5678,1
2,0000c576-e929-19eb-615a-349ec3b4709b,6461,1
3,0000d196-6385-80b8-661d-b7427042daa3,9040,1
4,0000e1e2-f595-0ae7-860f-fcc07dcb116e,6709,1


In [140]:
userProjectCount["userCode"].value_counts()

de89bac5-57c6-ecfb-184d-cc4e973c31ac    3700
31bb9bf0-8ad5-3e50-f334-e6c7de89bac5    2495
b883abba-dc3e-f30d-a571-98fffeffe88f    1254
3f0b3f64-468d-8411-efee-debb3facd532    1177
987779b7-042c-ddea-e43c-5132045fe84e     967
5a5c8ed0-dee9-beff-0a91-4c65d31e3744     827
eec0a125-d5ab-f692-c501-4713c35c756d     772
e400406a-b73d-3e78-d37d-53fcddebccaf     538
72bd11b1-1880-2468-a691-78a33f0b3f64     479
02e5a154-8e62-a827-db93-bf15c35da2a5     473
3d3b8010-60e7-f02b-2ba7-b3e96814cd43     304
de72fec5-e0c8-2a3c-78ae-eb033b3180b7     257
0fb6c4c9-da0f-4522-6694-e62ac5134d91     202
8258ff86-189d-a4b9-c9f4-105d495167f2     191
5beda6e8-dd84-35a6-7613-84aac7f9cda9     187
9b2669b6-fa23-8dfc-2684-c296b9069f15     186
8bc40ff4-237b-29dd-7760-455390454b7b     183
983c73d4-20ae-bbdc-62e5-be824d9f4c64     176
67a95419-5eeb-8ba5-eac0-6390e9dfbcc1     170
894a4600-3006-06c5-bff9-cd37a3618bc1     169
1aecd8e5-f02b-ba5c-e865-49da43ad8f22     150
57e59e2d-c706-59a8-61c6-efe32f6bf35c     150
72b7fa59-b

In [42]:
# The columns must correspond to user id, item id and ratings (in that order).
train_data = Dataset.load_from_df(userProjectCount[['userCode', 'project_id', 'counts']], reader)
train = train_data.build_full_trainset()

In [36]:
# We can now use this dataset as we please, e.g. calling cross_validate
reader = Reader(rating_scale=(userProjectCount.counts.min(), userProjectCount.counts.max()))
cross_validate(NormalPredictor(), data, cv=2)

{'fit_time': (0.6060791015625, 0.7657840251922607),
 'test_mae': array([1.25583424, 1.2696019 ]),
 'test_rmse': array([2.35527119, 2.33087065]),
 'test_time': (4.847996950149536, 5.349361181259155)}

In [43]:
#Now we will train on train set
algo = SVD()
algo.fit(train)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24d860811d0>

In [47]:
userid = "00005aba-5ebc-0821-f5a9-bacca40be125"
itemid = "5342"
actual_rating = 4
algo.predict(userid, itemid)

Prediction(uid='00005aba-5ebc-0821-f5a9-bacca40be125', iid='5342', r_ui=None, est=1.5555045406358934, details={'was_impossible': False})

## Now try to predict each project_id the user want


In [80]:
# Get unique userCode and their visited project id
test_df = test.groupby("userCode").agg({"project_id":lambda x: set(x)}).reset_index()
test_df.head()

Unnamed: 0,userCode,project_id
0,00023eed-55ce-8642-3e27-9bcaf777bddb,{3604}
1,000b10b8-843c-6de6-1edd-cff7ed081a63,{6207}
2,000ee534-2db8-6293-53fa-4030608c9878,{6230}
3,0011ca72-ea19-b6f9-6f67-9e650183b00c,{7697}
4,00132ba8-fff1-3b8c-8836-f0ecede89ccf,{7890}


In [144]:
proj_main_df = pd.read_csv("./data/project_main.csv", delimiter=";")
project_ids = proj_main_df["project_id"].unique()
print(len(project_ids))
project_ids[:5]

5379


array([ 44,  24, 161,  73, 219], dtype=int64)

In [146]:
def get_top_n(userId, n=7):
    project_scores = []
    for project_id in project_ids:
        score = algo.predict(userId, project_id).est
        project_scores.append((project_id,score))
    project_scores.sort(key=lambda x: -x[1])
    #print(project_scores[:n])
    return " ".join([str(proj_id) for proj_id,_ in project_scores[:n]])
print(get_top_n("fe95ffd1-e5d9-9b5f-a5fa-522271110006"))
get_top_n("de89bac5-57c6-ecfb-184d-cc4e973c31ac")

4458 5858 4173 8688 5634 5404 7118


'4458 5858 4201 8748 8717 8091 7694'

In [147]:
%%time
test_df["recommended"] = test_df["userCode"].apply(get_top_n,1000)

Wall time: 19min 48s


In [158]:
userProjectCount["userCode"].head().apply(get_top_n)

0    8977 9407 9369 4458 5932 8834 3588
1    8977 6367 9369 7722 9028 3774 6934
2    9091 8977 6367 6258 7131 7722 6934
3    5858 4458 5067 3774 8153 8748 5287
4    9387 4791 3278 8056 8545 2396 9214
Name: userCode, dtype: object

## Measure Mean-Average-Precision

In [148]:
test_df.to_csv("./clean_data/test_predicted_result.csv",index=False)

In [159]:
test_df.head()

Unnamed: 0,userCode,project_id,recommended,AP
0,00023eed-55ce-8642-3e27-9bcaf777bddb,{3604},9471 3774 7722 7839 7118 9407 6536,0.0
1,000b10b8-843c-6de6-1edd-cff7ed081a63,{6207},9471 3774 7722 7839 7118 9407 6536,0.0
2,000ee534-2db8-6293-53fa-4030608c9878,{6230},9471 3774 7722 7839 7118 9407 6536,0.0
3,0011ca72-ea19-b6f9-6f67-9e650183b00c,{7697},9471 3774 7722 7839 7118 9407 6536,0.0
4,00132ba8-fff1-3b8c-8836-f0ecede89ccf,{7890},9471 3774 7722 7839 7118 9407 6536,0.0


In [163]:
test_df.sort_values("recommended")

Unnamed: 0,userCode,project_id,recommended,AP
3216,1aecd8e5-f02b-ba5c-e865-49da43ad8f22,"{2560, 2561, 3072, 3073, 2564, 2565, 3076, 410...",1104 6505 6337 5858 8748 8896 3278,0.0
7470,3dfef978-8e34-bcdf-f1fe-454880fddfc2,{8736},1553 5538 6367 3521 2560 3790 9394,0.0
27547,e1603dce-ee2b-3f4b-86d7-6388f260d39b,{2437},1908 3521 8880 7343 7776 2719 7026,0.0
21575,b165703c-f328-4dfe-c04d-e52ba3b86f4c,"{8326, 5673, 4201, 5323, 5396, 2328}",1965 6367 6008 8253 5424 6152 8888,0.0
16065,83cf0a89-a38c-039c-f2e9-915a3e8713ad,{6511},1968 9118 9330 5996 9271 9028 9369,0.0
22162,b6215908-6477-6224-789c-310bff38214e,{6444},2100 3790 5067 8689 4458 8796 7839,0.0
8240,4465a79e-3665-ee71-c8e8-1df0e9e63fc1,{8812},2108 9091 8082 9330 2560 9382 5538,0.0
9539,4ef24b1b-8d81-2d55-5d76-d594d363a482,{2219},2219 7798 7839 7118 9028 8977 5803,0.0
28536,e94ea847-91ad-b8d8-cd6c-96b9b9445139,{3796},2266 3774 7722 6364 8688 6367 9118,0.0
18071,94d35395-b62b-3033-f11f-82b0150899b2,{9483},2266 3774 9113 9118 6934 7361 7839,0.0


In [150]:
def score_AP(correct, recommended, n=7):
    recommended = recommended.split(" ")
    assert(len(set(recommended))==len(recommended)) # Recommended should be unique    
    precisions = []
    correct_score = 1
    for idx, recommend_item in enumerate(recommended):
        if recommend_item not in correct: precisions.append(0)
        else:             
            precisions.append(correct_score/(idx+1))
            correct_score+=1
            
        
    return sum(precisions)/min(len(correct),n)
score_AP(["1","2","7","9"], "5 2 1",3)

0.38888888888888884

In [153]:
%%time
test_df["AP"] = test_df.apply(lambda x: score_AP(x["project_id"],x["recommended"],7),axis=1)

Wall time: 911 ms


In [154]:
#หนทางอีกยาวไกล
test_df["AP"].mean()

0.0