## Milena e Thaís - LightFM Recommender System

In [None]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/316.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm


In [None]:
import numpy as np
import pandas as pd
from lightfm import LightFM
import lightfm.cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.data import Dataset
import sys

### Leitura e Análise Inicial dos Dados

In [None]:
# Item metadata, one JSON object per line. The listed columns are dropped on load.
content = pd.read_json('/content/content.jsonl', lines=True).drop(
    columns=[
        'Rated', 'Released', 'Writer', 'Plot', 'Poster', 'Ratings', 'DVD',
        'Production', 'Website', 'Response', 'totalSeasons', 'Season',
        'Episode', 'seriesID', 'Type', 'Runtime',
    ]
)
# User-item interactions, one JSON object per line; the Timestamp column is dropped.
ratings = pd.read_json('/content/ratings.jsonl', lines=True).drop(columns=['Timestamp'])
# (UserId, ItemId) pairs to rank; passed to predict() further down.
targets = pd.read_csv('/content/targets.csv')

In [None]:
content.columns #List all remaining columns of content

### Funções Auxiliares

In [None]:
def generateGenres(content, column):
    """Turn a comma-separated genre column into LightFM-style binary features.

    For every distinct genre ``g`` found in ``content[column]`` a boolean
    column named ``g`` is added to ``content`` (in place). Each item is then
    described by one ``"<genre>:True"`` or ``"<genre>:False"`` string per genre,
    e.g. an Action/Adventure film gets ``"Action:True"`` and ``"Adventure:True"``.

    Args:
        content: DataFrame holding the items; modified in place.
        column: Name of the column with comma-separated genre names.

    Returns:
        ``(features_vector, features_matrix)``. ``features_vector`` lists every
        possible feature name (the ``:False`` and ``:True`` variant of each
        genre, genres in sorted order). ``features_matrix`` holds one list per
        row of ``content`` with that row's feature names, in the same genre
        order.
    """
    # Strip spaces and split on commas so every row becomes a list of genres.
    genre_lists = content[column].str.replace(' ', '').str.split(',')

    # Sorted so the feature order is identical on every run; iterating a set
    # of strings can give a different order in each interpreter session.
    unique_genres = sorted({name for names in genre_lists for name in names})

    features_vector = []
    for g in unique_genres:
        # Exact membership in the split list. A substring test on the raw
        # string would, for example, flag every "Musical" as "Music" too.
        content[g] = genre_lists.apply(lambda names, g=g: g in names)
        features_vector.append(g + ':False')
        features_vector.append(g + ':True')

    # One feature list per item, built straight from the split genre lists.
    features_matrix = [
        [g + ':' + str(g in names) for g in unique_genres]
        for names in genre_lists
    ]

    return features_vector, features_matrix

In [None]:
def filterColumn(item_features, column, prefix='g:'):
    """Convert a comma-separated string column into lists and name its features.

    ``item_features[column]`` is rewritten in place: each string has its spaces
    removed and is split on commas. Values that are not strings (for example
    lists left by an earlier run of the same cell) are kept as they are, so
    the function can be called again on the same frame.

    Args:
        item_features: DataFrame holding the items; modified in place.
        column: Name of the column to convert.
        prefix: String put in front of every value to form a feature name.

    Returns:
        ``(features, unique)``: the prefixed feature names in sorted order, and
        the set of distinct raw values.
    """
    def _to_list(value):
        # Only strings are split; anything else passes through untouched.
        if isinstance(value, str):
            return value.replace(' ', '').split(',')
        return value

    item_features[column] = item_features[column].apply(_to_list)

    unique = set()
    for names in item_features[column]:
        unique.update(names)

    # Sorted so the feature order does not depend on set iteration order.
    features = [prefix + name for name in sorted(unique)]

    return features, unique

In [None]:
def genColumn(item_features, column, prefix='g:'):
    """Build the per-item LightFM feature lists for a list-valued column.

    Args:
        item_features: DataFrame holding the items.
        column: Name of a column whose cells are iterables of values.
        prefix: String put in front of every value to form a feature name.

    Returns:
        One list per row, containing ``prefix + str(value)`` for every value in
        that row's cell, in row order.
    """
    # Iterate the column directly: no need to materialise every row as a dict,
    # and it avoids ``to_dict(orient='Records')``, whose capitalised spelling
    # is rejected by pandas 2.
    return [
        [prefix + str(value) for value in values]
        for values in item_features[column]
    ]

In [None]:
def intOrNa(value):
    """Normalise a numeric string to a rounded-integer string.

    Commas are removed first. The literal ``'N/A'`` maps to an empty string;
    anything else is parsed as a float, rounded with ``round`` and returned as
    a string.
    """
    cleaned = value.replace(',', '')
    if cleaned == 'N/A':
        return ''
    return str(round(float(cleaned)))

def strToInt(item_features, column):
    """Apply ``intOrNa`` to every value of a column and list the feature names.

    The column is overwritten in place with the normalised strings. The return
    value is ``column + ':' + value`` for each distinct normalised value, in
    order of first appearance.
    """
    item_features[column] = item_features[column].apply(intOrNa)
    distinct_values = item_features[column].drop_duplicates().to_list()
    return [column + ':' + str(value) for value in distinct_values]

In [None]:
def filterBig(df, column):
    """Bucket a numeric column into powers of ten and store it as strings.

    Buckets (applied in place to ``df[column]``):
      * values above 10**5 become 10**5;
      * values strictly between 10**k and 10**(k+1), for k = 4, 3, 2, become 10**k;
      * values below 10**2 become 10;
      * exact powers of ten in range keep their value;
      * missing values become the string ``'N/A'``.

    The column is finally replaced by the string form of those values.

    Args:
        df: DataFrame to modify in place.
        column: Name of the numeric column to bucket.
    """
    values = df[column]
    missing = values.isna()
    bucketed = values.copy()

    # Every mask is computed from the original values, so a row can only ever
    # land in one bucket.
    bucketed.loc[values > 10**5] = 10**5
    for exponent in (4, 3, 2):
        inside_band = (values > 10**exponent) & (values < 10**(exponent + 1))
        bucketed.loc[inside_band] = 10**exponent
    bucketed.loc[values < 10**2] = 10**1

    # Switch to object dtype before inserting the 'N/A' label: writing a string
    # straight into a numeric column relies on silent upcasting, which pandas
    # has deprecated.
    df[column] = bucketed.astype(object).where(~missing, 'N/A').astype(str)


In [None]:
def awardsFeats(item_features, column, prefix='n:'):
    """Bucket an award-count column (nominations or wins) into five categories.

    Buckets are labelled by their lower bound: 0 (anything below 1, including
    missing values), 1 (1-4), 5 (5-9), 10 (10-14) and 15 (15 or more).
    ``item_features`` is not modified.

    Args:
        item_features: DataFrame holding the items.
        column: Name of the numeric count column.
        prefix: String put in front of the bucket label to form a feature name.

    Returns:
        ``(features, feats)``: the five possible feature names, and one feature
        name per row of ``item_features`` in row order.
    """
    def _bucket(count):
        # Largest lower bound that the count reaches; 0 when it reaches none.
        for lower_bound in (15, 10, 5, 1):
            if count >= lower_bound:
                return lower_bound
        return 0

    features = [prefix + str(label) for label in (0, 1, 5, 10, 15)]
    # Labels come from the integer bounds rather than the column values, so a
    # float column cannot yield names like 'n:5.0' that are not in `features`.
    feats = [prefix + str(_bucket(count)) for count in item_features[column].fillna(0)]
    return features, feats

In [None]:
def filterBigBoxOffice(df, column):
    """Bucket a box-office column into powers of ten and store it as strings.

    Buckets (applied in place to ``df[column]``):
      * values above 10**8 become 10**8;
      * values strictly between 10**k and 10**(k+1), for k = 7 down to 2, become 10**k;
      * values below 10**2 become 10;
      * exact powers of ten in range keep their value;
      * missing values become the string ``'N/A'``.

    The column is finally replaced by the string form of those values.

    Args:
        df: DataFrame to modify in place.
        column: Name of the numeric column to bucket.
    """
    values = df[column]
    missing = values.isna()
    bucketed = values.copy()

    # Every mask is computed from the original values, so a row can only ever
    # land in one bucket.
    bucketed.loc[values > 10**8] = 10**8
    for exponent in (7, 6, 5, 4, 3, 2):
        inside_band = (values > 10**exponent) & (values < 10**(exponent + 1))
        bucketed.loc[inside_band] = 10**exponent
    bucketed.loc[values < 10**2] = 10**1

    # Switch to object dtype before inserting the 'N/A' label: writing a string
    # straight into a numeric column relies on silent upcasting, which pandas
    # has deprecated.
    df[column] = bucketed.astype(object).where(~missing, 'N/A').astype(str)

### Limpeza dos dados de content

Genre: Transformar os gêneros em colunas binárias

Director: Transformar coluna de string pra lista

Language: Agrupar línguas pouco frequentes (menos de 200 itens) em "Other"

Awards: Separar nominations de wins

Metascore: Transformar em int

imdbRating: Transformar em int

imdbVotes: Transformar em int

BoxOfficeOg: Remover "$" e "," e converter para número

In [None]:
# Keep an "original" copy of some columns before they are turned into
# categorical features. These copies are used for cold-start users and to give
# weight to popularity later on.
content['imdbVotesOg'] = content['imdbVotes'].copy()
content['imdbRatingOg'] = content['imdbRating'].copy()
content['MetascoreOg'] = content['Metascore'].copy()
content['BoxOfficeOg'] = content['BoxOffice'].copy()

# Convert votes to a number: strToInt normalises the strings in place
# (thousands separators removed, 'N/A' -> ''), then to_numeric parses them.
strToInt(content, 'imdbVotesOg')
content['imdbVotesOg'] = pd.to_numeric(content['imdbVotesOg'])

# Convert box office to a number: strip "$" and ","; values that still do not
# parse become NaN. The pattern is a raw string so that "\$" is a regex escape
# rather than an invalid Python string escape.
content['BoxOfficeOg'] = content['BoxOfficeOg'].replace(r'[\$,]', '', regex=True)
content['BoxOfficeOg'] = pd.to_numeric(content['BoxOfficeOg'], errors='coerce', downcast='integer')



In [None]:
# Language feature: keep only the first listed language (spaces removed),
# relabel 'N/A' as "None", and fold every language that appears fewer than
# 200 times into "Other".
content['Language'] = content['Language'].apply(lambda raw: raw.split(',')[0].replace(' ', ''))
is_missing_language = content['Language'] == 'N/A'
content.loc[is_missing_language, 'Language'] = "None"
# Per-row frequency of that row's language, as a plain array.
language_counts = content['Language'].map(content['Language'].value_counts()).to_numpy()
content.loc[language_counts < 200, 'Language'] = "Other"

In [None]:
# Country feature: keep only the first listed country (spaces removed),
# relabel 'N/A' as "None", and fold every country that appears fewer than
# 200 times into "Other".
content['Country'] = content['Country'].apply(lambda raw: raw.split(',')[0].replace(' ', ''))
is_missing_country = content['Country'] == 'N/A'
content.loc[is_missing_country, 'Country'] = "None"
# Per-row frequency of that row's country, as a plain array.
country_counts = content['Country'].map(content['Country'].value_counts()).to_numpy()
content.loc[country_counts < 200, 'Country'] = "Other"

In [None]:
# Actors feature: keep only the first listed actor (spaces removed) and
# relabel 'N/A' as "None". The frequency filter below is disabled.
content['Actors'] = content['Actors'].apply(lambda x: x.split(',')[0].replace(' ',''))
content.loc[content['Actors']=='N/A', 'Actors'] = "None"
#content.loc[content['Actors'].value_counts()[content['Actors']].values < 1, 'Country'] = "Other"

In [None]:
# Nominations column: take the number in front of the first "nomination" in the
# free-text Awards column. Rows without such a number (or with a missing Awards
# value) count as 0, so the column is always an integer.
content['Nominations'] = (
    pd.to_numeric(
        content['Awards'].str.extract(r'([0-9]+) nomination', expand=False),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

In [None]:
# Wins column: take the number in front of the first "win" in the free-text
# Awards column; wins of specific named awards are not considered. Rows without
# such a number (or with a missing Awards value) count as 0, so the column is
# always an integer.
content['Wins'] = (
    pd.to_numeric(
        content['Awards'].str.extract(r'([0-9]+) win', expand=False),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

In [None]:
# Genre features: generateGenres adds one boolean column per genre to content
# and returns (all possible feature names, per-item feature lists).
genre_features, genre_feat = generateGenres(content, 'Genre')

In [None]:
# Award features: bucketed Wins ('w:' prefix) and Nominations ('n:' prefix).
# Each call returns (possible feature names, one feature name per item).
wins_features,wins_feats = awardsFeats(content,'Wins','w:')
nom_features,nom_feats = awardsFeats(content,'Nominations','n:')

In [None]:
# Director features: filterColumn rewrites content['Director'] from strings to
# lists in place, so genColumn (which iterates those lists) must run after it.
director_features,_ = filterColumn(content,'Director','dir:')
dir_feat = genColumn(content,'Director','dir:')

In [None]:
# Rating features: strToInt rewrites each column in place as rounded-integer
# strings ('N/A' becomes '') and returns the '<column>:<value>' feature names.
rating_features = strToInt(content,'imdbRating')
metascore_features = strToInt(content,'Metascore')

In [None]:
# Language features: the distinct cleaned language labels, used as feature names as-is.
language_features = list(content['Language'].unique())

In [None]:
# Country features: the distinct cleaned country labels, used as feature names as-is.
# NOTE(review): neither Language nor Country labels carry a prefix, so labels
# such as "None" / "Other" can be the same feature name for both — confirm this is intended.
country_features = list(content['Country'].unique())

In [None]:
#actors
#actors_features = list(content['Actors'].unique())

In [None]:
# Votes features
# First pass: normalise the raw strings in place (thousands separators removed, 'N/A' -> '').
_=strToInt(content,'imdbVotes')
content['imdbVotes']=pd.to_numeric(content['imdbVotes'])
# Bucket the counts in place; filterBig leaves the column as strings.
filterBig(content,'imdbVotes')
# Second pass: run the bucket strings through intOrNa again and collect the
# 'imdbVotes:<bucket>' feature names.
votes_features = strToInt(content,'imdbVotes')

In [None]:
#year
#max_value = content['Year'].max()
#min_value = content['Year'].min()

#print(f"Maximum value: {max_value}")
#print(f"Minimum value: {min_value}")

In [None]:
#content['Year'] = content['Year'].str.extract('(\d+)').astype(float)

In [None]:
#content['Year'] =  content['Year'].str.replace(r'\D', '').astype(int)

In [None]:
#year_features = ['year'+':'+str(rating) for rating in content['Year'].drop_duplicates().to_list()]

In [None]:
#year_feats = feats = ['year'+':'+str(x) for x in content['Year']]

In [None]:
#convert boxoffice to int
#content['BoxOffice'] = content['BoxOffice'].replace('[\$,]', '', regex=True)
#content['BoxOffice'] = pd.to_numeric(content['BoxOffice'], errors='coerce', downcast='integer')
#filterBigBoxOffice(content,'BoxOffice')
#box_features = ['BoxOffice'+':'+str(rating) for rating in content['BoxOffice'].drop_duplicates().to_list()]


In [None]:
# Build the (item id, [feature names]) pairs for LightFM, one per row of content.
# Each item gets its genre features followed by: imdbRating, imdbVotes, wins,
# nominations, language and country.
# Director, Metascore, BoxOffice, Year and Actors features were tried and left
# out because they made the recommender worse.
item_ids = content['ItemId'].to_list()
feat = [
    (
        item_id,
        genres + ['imdbRating:' + rating, 'imdbVotes:' + votes, win, nomination, language, country],
    )
    for item_id, genres, rating, votes, win, nomination, language, country in zip(
        item_ids,
        genre_feat,
        content['imdbRating'],
        content['imdbVotes'],
        wins_feats,
        nom_feats,
        content['Language'],
        content['Country'],
    )
]

In [None]:
# Full list of feature names registered with the LightFM dataset. It has to
# contain every name used in the per-item feature lists (feat).
# Candidate groups:
#genre
#director
#ratings
#votes
#language
#wins
#nominations
#countries

# Earlier combinations, kept for reference:
#features = genre_features+director_features+rating_features+votes_features+language_features
#features = genre_features+director_features+rating_features+metascore_features+votes_features+wins_features+language_features+box_features
features = genre_features+rating_features+votes_features+wins_features+language_features+country_features+nom_features

In [None]:
# Build the LightFM dataset
# Register the user and item ids that appear in the ratings.
dataset = Dataset()
dataset.fit((x for x in ratings['UserId'].to_list()),
            (x for x in ratings['ItemId'].to_list()))
num_users, num_items = dataset.interactions_shape()
# One interaction per ratings row; only the (UserId, ItemId) pair is passed, no weight.
(interactions, weights) = dataset.build_interactions(((x['UserId'], x['ItemId'])
                                                    for x in ratings.to_dict(orient='records')))
# Register every item id from content plus all feature names, then build the
# item-feature matrix from the per-item feature lists.
# NOTE(review): this runs after build_interactions, so items that appear only in
# content were not registered when `interactions` was built (and num_items was
# read before them too) — confirm the resulting shapes are as intended.
dataset.fit_partial(items=(x for x in content['ItemId'].to_list()),
        item_features=features)
item_feat = dataset.build_item_features(feat)


In [None]:
# Train LightFM: WARP loss with a fixed random_state, fitted for 5 epochs on the
# interactions with the item-feature matrix.
model = LightFM(loss='warp', random_state=3, max_sampled=5)
model.fit(interactions, item_features=item_feat, epochs=5)

In [None]:
# Prepare for prediction: fetch the dataset's mappings. user_id_map and
# item_id_map are used below to translate the ids in targets.
user_id_map, user_feature_map, item_id_map, item_feature_map =dataset.mapping()

In [None]:
def sort_by_features(df):
    """Rank the candidate items of a cold-start user by item benchmarks.

    Candidates are joined with ``content`` and ordered by vote count,
    nominations, wins and IMDb rating (all descending), with ``ItemId``
    ascending as the final tie-break. Rows with a missing value in a sort key
    are placed after the rows that have one.

    Args:
        df: DataFrame with ``UserId`` and ``ItemId`` columns for one user.

    Returns:
        DataFrame with columns ``UserId`` and ``ItemId`` in ranked order.
    """
    # Left join so a candidate that is missing from content is ranked last
    # instead of silently disappearing from the output.
    predictions = pd.merge(df, content, on='ItemId', how='left')

    # The "Og" rating may still be text ('7.5', 'N/A'); compare it as a number
    # so that, for example, 10.0 ranks above 9.5 and 'N/A' does not rank first.
    predictions['imdbRatingOg'] = pd.to_numeric(predictions['imdbRatingOg'], errors='coerce')

    predictions = predictions.sort_values(
        by=['imdbVotesOg', 'Nominations', 'Wins', 'imdbRatingOg', 'ItemId'],
        ascending=[False, False, False, False, True],
    )
    return predictions[['UserId', 'ItemId']]

In [None]:
def sort_by_predictions(df):
    """Rank a known user's candidate items by LightFM score plus popularity.

    The model scores are standardised over the user's candidates, then 0.25
    times the standardised vote count (``imdbVotesOg``) is added, because users
    seem to prefer popular items. Candidates are returned by descending
    combined score, with ``ItemId`` ascending as the tie-break. The input frame
    is not modified.

    Args:
        df: DataFrame with ``UserId`` and ``ItemId`` columns for one user. Every
            id must be present in ``user_id_map`` / ``item_id_map``.

    Returns:
        DataFrame with columns ``UserId`` and ``ItemId`` in ranked order.

    Raises:
        KeyError: If a user or item id is missing from the LightFM mappings.
    """
    def _standardize(values):
        # z-score. When the spread is zero or undefined (a single row, or all
        # values equal) the term contributes 0 instead of turning every score
        # into NaN. Missing inputs stay missing.
        spread = values.std()
        if not spread > 0:
            return values - values
        return (values - values.mean()) / spread

    # Translate the external ids into LightFM's internal indices.
    users = df['UserId'].apply(lambda u: user_id_map[u]).values
    items = df['ItemId'].apply(lambda i: item_id_map[i]).values

    # Work on a copy: df is usually a groupby slice of the caller's frame.
    scored = df.copy()
    scored['Prediction'] = model.predict(users, items, item_features=item_feat)

    # Standardise so the score can be combined with the popularity term.
    scored['Prediction'] = _standardize(scored['Prediction'])

    # Left join so a candidate that is missing from content is not dropped.
    scored = pd.merge(scored, content, on='ItemId', how='left')
    scored['Prediction'] += 0.25 * _standardize(scored['imdbVotesOg'])
    #scored['Prediction'] += 0.125 * _standardize(scored['BoxOfficeOg'])

    # Highest combined score first; ItemId breaks ties.
    scored = scored.sort_values(by=['Prediction', 'ItemId'], ascending=[False, True])
    return scored[['UserId', 'ItemId']]


In [None]:
def predict(to_predict):
    """Rank the candidate items of every user in ``to_predict``.

    Rows are grouped by ``UserId``. Users missing from ``user_id_map`` (cold
    start) are ranked with ``sort_by_features``; all others with
    ``sort_by_predictions``.

    Returns:
        The per-user rankings concatenated in ascending ``UserId`` order, each
        user's rows in the order returned by its ranking function.
    """
    ranked_groups = []
    for user, candidates in to_predict.groupby('UserId'):
        # Known users go through the model; unknown ones fall back to benchmarks.
        ranker = sort_by_predictions if user in user_id_map else sort_by_features
        ranked_groups.append(ranker(candidates))

    return pd.concat(ranked_groups)

In [None]:
# Rank every (UserId, ItemId) pair in targets; df holds the final ordering that is written out below.
df=predict(targets)

In [None]:
# Final submission file: a "UserId,ItemId" header followed by one row per
# ranked pair, in the order produced by predict().
df[['UserId', 'ItemId']].to_csv('/content/output.csv', index=False)