In [1]:
# Enable IPython's autoreload extension in mode 1, registering the project
# helper module `my` via %aimport so edits to it are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport my

import sys
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import pickle

import my
from my import p

# Notebook-wide pandas display settings.
# NOTE(review): the float_format entry presumably takes precedence over
# display.precision when floats are rendered -- confirm whether the precision
# setting is still needed.
display_options = {
    "display.max_rows": 200,
    "display.max_colwidth": 45,
    "display.precision": 1,
    "display.float_format": "{:.3f}".format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
# Handy while exploring:
#   pd.set_option("display.max_rows", 5)
#   pd.reset_option("display.max_rows")

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

# Initialise pandarallel with progress bars enabled.
# NOTE(review): no pandarallel call appears in the cells visible here --
# confirm this initialisation is still needed.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
     
# experiment number or name
# n_exp = '1/'
# Output and input locations. Both keep a trailing slash because file paths
# in this notebook are built by plain string concatenation.
dir_out = "out/"
dir_data = 'data/'
# Share the data directory with the project helper module.
my.dir_data = dir_data
os.makedirs(dir_out, exist_ok=True)

# Seed NumPy's global RNG; the same constant is reused as the model's
# random_state.
RANDOM_STATE = 34
np.random.seed(RANDOM_STATE)

# os.cpu_count() returns None when the number of CPUs cannot be determined.
# Fall back to a single worker so N_CPU is always a usable integer when it is
# passed on as a thread count.
N_CPU = os.cpu_count() or 1

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the item table and namespace the language column with the "f4:" prefix,
# matching the "f1:".."f3:" prefixes already present in the other feature columns.
it = (
    pd.read_csv(f"{dir_data}bib.item", sep='\t')
    .assign(**{'lang:token': lambda frame: 'f4:' + frame['lang:token']})
)
it.head(3)

Unnamed: 0,item_id:token,ff_year_izd:token,ff_author:token,ff_izd:token,lang:token,clean_lemma_title:token_seq
0,RSL01008600016,f1:2015,f2:av_,f3:izd4,f4:ru,судебный следствие уголовный процесс росс...
1,RSL01004304880,f1:2005,f2:av_,f3:izd0,f4:ru,уральский казачество его роль система рос...
2,RSL07000461043,f1:2015,f2:av0,f3:izd0,f4:no,notitle


In [3]:
# Flat array holding every distinct token of the four categorical item columns.
item_feature_columns = [
    'ff_year_izd:token',
    'ff_author:token',
    'ff_izd:token',
    'lang:token',
]
uniq_it_features = np.concatenate(
    [it[column].unique() for column in item_feature_columns]
)
len(uniq_it_features), uniq_it_features[:10]

(419,
 array(['f1:2015', 'f1:2005', 'f1:1995', 'f1:1970', 'f1:1900', 'f1:1945',
        'f1:1800', 'f2:av_', 'f2:av0', 'f2:av1'], dtype=object))

In [4]:
# Load the user table; the read-count column is cast to string and given the
# "f4:" prefix so it is namespaced like the other feature columns.
us = (
    pd.read_csv(f"{dir_data}bib.user", sep='\t')
    .assign(**{
        'ff_read_counts:token':
            lambda frame: 'f4:' + frame['ff_read_counts:token'].astype(str)
    })
)
us.head(3)

Unnamed: 0,user_id:token,ff_age:token,ff_gender:token,ff_chit_type:token,ff_read_counts:token
0,300001020830,f1:17,f2:f,f3:no,f4:20
1,300001113642,f1:30,f2:f,f3:no,f4:20
2,300001148466,f1:40,f2:f,f3:no,f4:20


In [5]:
# Flat array holding every distinct token of the four categorical user columns.
user_feature_columns = [
    'ff_age:token',
    'ff_gender:token',
    'ff_chit_type:token',
    'ff_read_counts:token',
]
uniq_user_features = np.concatenate(
    [us[column].unique() for column in user_feature_columns]
)
len(uniq_user_features), uniq_user_features

(15,
 array(['f1:17', 'f1:30', 'f1:40', 'f1:22', 'f1:60', 'f1:14', 'f2:f',
        'f2:m', 'f3:no', 'f3:echb', 'f3:normal', 'f4:20', 'f4:3', 'f4:7',
        'f4:50'], dtype=object))

In [6]:
# Load the user-item interaction log and preview the first three rows.
df = pd.read_csv(f"{dir_data}bib.inter", sep='\t')
df.head(3)

Unnamed: 0,user_id:token,item_id:token,is_printed:token,timestamp:float
0,100000641403,RSL01004206702,False,1613865600
1,100000641403,RSL01004211574,False,1613865600
2,100000641403,RSL01000769304,False,1616457600


In [7]:
from lightfm import LightFM
from lightfm.data import Dataset

# Register all user ids, item ids and feature tokens with a LightFM Dataset so
# it can build its id / feature mappings.
dataset = Dataset()
dataset.fit(
    users=us['user_id:token'].unique(),
    items=it['item_id:token'].unique(),
    user_features=uniq_user_features,
    item_features=uniq_it_features,
)

# mapping() yields (user id map, user feature map, item id map, item feature map)
lfm_map = dataset.mapping()

# Size of each of the four maps.
[len(mapping) for mapping in lfm_map]

[16753, 16768, 341312, 341731]

In [8]:
# Lookup tables between external ids and the Dataset's internal indices, in
# both directions.
#
# The maps are read straight from dataset.mapping() -- which returns
# (user id map, user feature map, item id map, item feature map) -- instead of
# indexing the previous value of `lfm_map`. Because this cell rebinds
# `lfm_map` to a dict, indexing the old value with 0 / 2 would raise KeyError
# when the cell is run a second time.
id_maps = dataset.mapping()

lfm_map = {
    'u_to_ids': id_maps[0],
    'i_to_ids': id_maps[2],
}

# Inverse maps: internal index -> external id.
lfm_map['ids_to_u'] = {idx: user_id for user_id, idx in lfm_map['u_to_ids'].items()}
lfm_map['ids_to_i'] = {idx: item_id for item_id, idx in lfm_map['i_to_ids'].items()}

In [9]:
# Build the user-feature matrix from (user_id, (feature tokens...)) pairs:
# the first column of `us` is the id, the remaining columns are its features.
gen_us_feats = us.itertuples(index=False, name=None)

user_feats = dataset.build_user_features(
    (row[0], row[1:]) for row in gen_us_feats
)

user_feats

<16753x16768 sparse matrix of type '<class 'numpy.float32'>'
	with 83765 stored elements in Compressed Sparse Row format>

In [10]:
# Build the item-feature matrix from (item_id, (feature tokens...)) pairs.
# Only the first five columns of `it` are used: the id followed by four
# feature columns.
gen_it_feats = it.iloc[:, :5].itertuples(index=False, name=None)

it_feats = dataset.build_item_features(
    (row[0], row[1:]) for row in gen_it_feats
)

it_feats

<341312x341731 sparse matrix of type '<class 'numpy.float32'>'
	with 1706560 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the title column. Terms are kept only if they occur
# in at least 10 titles and in at most 1% of them, capped at 10000 terms; the
# text is not lower-cased by the vectorizer.
vec = CountVectorizer(
    lowercase=False,
    min_df=10,
    max_df=0.01,
    max_features=10000,
    dtype=np.float32,
)

title_column = it['clean_lemma_title:token_seq']
it_text_feats = vec.fit_transform(title_column)

# Show the first 20 vocabulary terms and the vocabulary size.
vocabulary_terms = vec.get_feature_names_out()
p(vocabulary_terms[:20], len(vec.vocabulary_))

it_text_feats

In [12]:
from scipy.sparse import hstack

# Append the title bag-of-words columns to the categorical item-feature matrix.
#
# scipy's hstack returns a COO matrix unless a format is given; CSR is requested
# explicitly so the stacked matrix has the same format as the matrices returned
# by build_user_features / build_item_features.
# NOTE(review): LightFM presumably converts feature matrices to CSR on every
# fit/predict call, so this should also avoid repeated COO->CSR conversions --
# confirm.
#
# This cell consumes and deletes `it_text_feats`, so it cannot be re-run
# without first re-running the cell that creates it.
it_feats = hstack([it_feats, it_text_feats], format="csr")
del it_text_feats
gc.collect()

0

In [13]:
# Interaction matrix over every (user, item) pair in the log; the second value
# returned by build_interactions is not needed here.
user_item_pairs = df[['user_id:token', 'item_id:token']].to_numpy()
full_int_mat, _ = dataset.build_interactions(user_item_pairs)

In [14]:
from datetime import datetime

# Number of training epochs for the fit on the full interaction matrix.
FULL_N_EPOCHS = 60

# LightFM constructor arguments.
params = dict(
    no_components=1000,
    loss="warp",
    max_sampled=100,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
)

# Regularisation for user and item features is currently not passed to the model:
# ITEM_ALPHA = 1e-6
# USER_ALPHA = 1e-6

lf = LightFM(**params)

lf.fit(
    full_int_mat,
    user_features=user_feats,
    item_features=it_feats,
    epochs=FULL_N_EPOCHS,
    num_threads=N_CPU,
    verbose=True,
)

# Save the fitted model under a day_month_HHMM-stamped file name in the data directory.
current_time = datetime.now().strftime("%d_%m_%H%M")
model_path = dir_data + f'lfm_{current_time}.pl'
my.save_pickle(model_path, lf, True)

Epoch: 100%|██████████| 60/60 [11:06<00:00, 11.12s/it]


save:  data/lfm_11_09_1701.pl


In [15]:
# Independent frame of (user, item) pairs under the names 'chb' / 'sys_numb'
# (presumably the column names my.recommend expects -- confirm).
df1 = df[['user_id:token', 'item_id:token']].rename(
    columns={'user_id:token': 'chb', 'item_id:token': 'sys_numb'}
)

In [16]:
# Ask the project helper for recommendations with top_N=20 and write the
# result as a ';'-separated CSV without the index column.
preds = my.recommend(lf, df1, lfm_map, user_feats, it_feats, top_N=20)
preds.to_csv(f"{dir_out}sub21.csv", index=False, sep=';')

  0%|          | 0/16753 [00:00<?, ?it/s]