In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import scipy
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for the video text: word tokens of at least three word
# characters, accents stripped, English stop words removed, 1- to 3-grams,
# terms required to appear in at least 3 documents, at most 3000 features.
tf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{3,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_features=3000,
)


# Map every table name to the path of its CSV export: a file called
# "vitube_table_users.csv" is registered under the key "users".
csv_list = {
    file_name.replace("vitube_table_", "").replace(".csv", ""): "../input/vitube/" + file_name
    for file_name in os.listdir("../input/vitube")
}

activity_df = pd.read_csv(csv_list["activities"])
user_df = pd.read_csv(csv_list["users"])
category_df = pd.read_csv(csv_list["categories"])
# The liked/disliked flags of the watch history are discarded right after loading.
history_df = pd.read_csv(csv_list["watch_histories"]).drop(columns=["liked", "disliked"])

video_df = pd.read_csv(csv_list["videos"])

In [None]:
# Clean the free-text description: missing values become "" and URLs are
# removed. The result is assigned back to the column on purpose: calling
# `fillna(..., inplace=True)` on `video_df["description"]` acts on a temporary
# object under pandas copy-on-write, which would leave NaN in the frame and
# make `re.sub` fail on a float.
video_df["description"] = (
    video_df["description"]
    .fillna("")
    .apply(lambda text: re.sub(r"http\S+", "", text or ""))
)

features = ["id", "duration", "category_id", "comments", "name", "description"]
# `rename` returns a new frame, so the key column can be aligned with
# history_df["video_id"] without assigning `.columns` on a selection of video_df.
vid_info = video_df[features].rename(columns={"id": "video_id"})

# Left join keeps every watch-history row, even when the video is unknown.
# NOTE: this cell is not idempotent -- running it twice merges the video
# columns a second time (pandas then adds _x/_y suffixes). Re-run the loading
# cell first if this cell has to be executed again.
history_df = history_df.merge(vid_info, on="video_id", how="left")
history_df

In [None]:
# Columns of video_df that feed the feature matrix. "name" and "description"
# are combined into the TF-IDF text; the remaining columns are used as-is.
useful_feats = [
    "likes",
    "dislikes",
    "views",
    "duration",
    "category_id",
    "comments",
    "name",
    "description",
]

In [None]:
def consine_sim(x1, x2):
    """Return the cosine similarity of two vectors as a float in [-1, 1].

    Parameters
    ----------
    x1, x2 : array-like
        Numeric vectors with the same number of elements.

    Returns
    -------
    float
        dot(x1, x2) / (|x1| * |x2|). If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that
        callers can sort the scores reliably.

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    a = np.asarray(x1, dtype=float).ravel()
    b = np.asarray(x2, dtype=float).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # At least one all-zero vector: avoid the 0/0 division.
        return 0.0
    # Clip away floating-point overshoot such as 1.0000000000000002.
    return float(np.clip(np.dot(a, b) / denom, -1.0, 1.0))
def get_user_activities(user_id, action="like"):
    """Select the rows of activity_df whose user_id and type match the arguments."""
    same_user = activity_df["user_id"] == user_id
    same_type = activity_df["type"] == action
    return activity_df[same_user & same_type]

def get_last_video(user_id, n=100):
    """Return the last `n` watch-history rows of `user_id`.

    "Last" means the final rows in history_df's current row order; this
    assumes the history table is stored oldest-first -- TODO confirm, or sort
    by a timestamp column first if one exists.

    Parameters
    ----------
    user_id
        Value compared against history_df["user_id"].
    n : int or None, default 100
        Maximum number of rows to return. Pass None to get the user's whole
        history. (This argument used to be accepted but ignored.)

    Returns
    -------
    pandas.DataFrame
        The matching rows of history_df, at most `n` of them.
    """
    videos = history_df[history_df["user_id"] == user_id]
    if n is None:
        return videos
    return videos.tail(n)

def gen_video_feature(videos):
    """Project a video DataFrame onto the columns listed in useful_feats."""
    selected = videos[useful_feats]
    return selected

def concat_feat(feats, mat):
    """Stack numeric features next to a text matrix and scale columns by their max.

    Parameters
    ----------
    feats : pandas.DataFrame
        Numeric features, one row per item. Missing values are treated as 0.
        The frame itself is left untouched.
    mat : scipy sparse matrix or array-like
        Text features with the same number of rows as `feats`. Anything with
        a `toarray()` method is densified through it.

    Returns
    -------
    numpy.ndarray
        Array with feats.shape[1] + mat.shape[1] columns in which every column
        has been divided by its maximum. Columns whose maximum is 0 are left
        unscaled so that they do not turn into NaN/inf.
    """
    # Non-inplace fillna: the caller's DataFrame must not be modified.
    numeric = feats.fillna(0).values
    # `.toarray()` instead of the `.A` shortcut, which recent SciPy removed.
    dense_text = mat.toarray() if hasattr(mat, "toarray") else np.asarray(mat)
    combined = np.concatenate([numeric, dense_text], axis=1)
    # Maximum over axis 0 gives one value per column.
    col_max = combined.max(axis=0)
    # Guard against division by zero for columns with a maximum of 0.
    col_max = np.where(col_max == 0, 1, col_max)
    return combined / col_max[np.newaxis, :]

# Work on an explicit copy so the column assignment below neither triggers a
# SettingWithCopyWarning nor depends on whether the selection is a view.
feats = video_df[useful_feats].copy()
# Fill each text column *before* concatenating: NaN + str is NaN, so a missing
# name would otherwise discard the description of that video as well. Filling
# here (rather than `feats["text"].fillna("", inplace=True)`) also guarantees
# that the vectorizer never receives NaN under pandas copy-on-write.
feats["text"] = feats["description"].fillna("") + " " + feats["name"].fillna("")
mat = tf_vectorizer.fit_transform(feats["text"])

# Only the non-text columns go into the numeric part of the feature matrix.
feats = feats.drop(columns=["name", "description", "text"])
vectors = concat_feat(feats, mat)

In [None]:
def recommend_for(video_id, vectors, top_n=11):
    """Return the videos whose feature vectors are most similar to `video_id`.

    Parameters
    ----------
    video_id
        Value looked up in video_df["id"]; the first matching row is used.
    vectors : array-like, 2-D
        One feature vector per row, in the same row order as video_df.
    top_n : int, default 11
        Number of results to return. The query video is compared with itself
        too, so it normally occupies the first slot.

    Returns
    -------
    list of (id, name, score) tuples
        Sorted by descending cosine similarity; ties keep video_df order.

    Raises
    ------
    IndexError
        If `video_id` does not occur in video_df["id"].
    """
    matches = np.where(video_df["id"] == video_id)[0]
    if len(matches) == 0:
        raise IndexError("video_id {!r} not found in video_df".format(video_id))
    idx = matches[0]

    # Cosine similarity of every row against row `idx`, computed in one shot
    # from `vectors` itself rather than from the length of a global frame.
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1)
    denom = norms * norms[idx]
    dots = matrix.dot(matrix[idx])
    # Rows with a zero norm get similarity 0 instead of NaN from 0/0.
    scores = np.zeros(len(matrix))
    np.divide(dots, denom, out=scores, where=denom != 0)
    scores = np.clip(scores, -1.0, 1.0)

    # Stable sort on the negated scores: descending order, ties in row order,
    # and any remaining NaN scores are placed last.
    order = np.argsort(-scores, kind="stable")[:top_n]

    results = []
    for i in order:
        row = video_df.iloc[i]
        results.append((row["id"], row["name"], float(scores[i])))
    return results

# Show the closest matches for video 71 as a table.
recommended = recommend_for(71, vectors)
frame = pd.DataFrame(recommended, columns=["video_id", "video", "score"])
frame

In [None]:
# Sanity check for row 15: the non-zero TF-IDF columns of `mat` should match
# the non-zero entries of the text part of `vectors`. The offset 6 skips the
# six non-text columns that concat_feat places in front of the TF-IDF block.
# Only that row is densified (`.toarray()` replaces the removed `.A` shortcut).
# NOTE: only the last expression of a cell is displayed, so this tuple is
# computed but not shown unless it is moved to its own cell.
np.where(mat[15].toarray().ravel() > 0), np.where(vectors[15][6:] > 0)
# video_df[video_df["id"] == 5330]
vectors.shape
# tf_vectorizer.get_feature_names_out()[:10]

In [None]:
# Size of scikit-learn's built-in English stop-word list.
import sklearn.feature_extraction.text

len(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)

In [None]:
# Persist the feature matrix; NumPy appends ".npy", so the file is "save.npy".
np.save(file="save", arr=vectors)

In [None]:
# Inspect the watch history after the video metadata has been merged in.
history_df

In [None]:
# Report the on-disk size of the saved feature matrix.
!du -sh save.npy