## Content Based Example

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

### Setting sample data 

In [15]:
# Toy catalogue: one (title, space-separated genre labels) pair per movie.
_catalog = [
    ('The Matrix', 'Action Sci-Fi'),
    ('Titanic', 'Romance Drama'),
    ('The Avengers', 'Action Adventure Sci-Fi'),
    ('Shrek', 'Animation Comedy'),
    ('Toy Story', 'Animation Adventure Comedy'),
    ('Avatar', 'Action Adventure Sci-Fi'),
    ('The Dark Knight', 'Action Drama'),
]

# Column-oriented view of the same pairs, which is the layout pd.DataFrame is fed below.
data = {
    'title': [title for title, _ in _catalog],
    'genres': [genres for _, genres in _catalog],
}
df = pd.DataFrame(data)


### Dataset example

In [16]:
# Show every row rather than the default first five: the sample frame is small enough to read in full.
df.head(n=len(df))

Unnamed: 0,title,genres
0,The Matrix,Action Sci-Fi
1,Titanic,Romance Drama
2,The Avengers,Action Adventure Sci-Fi
3,Shrek,Animation Comedy
4,Toy Story,Animation Adventure Comedy
5,Avatar,Action Adventure Sci-Fi
6,The Dark Knight,Action Drama


### TF-IDF
* Esta técnica é usada aqui para converter os gêneros em uma matriz numérica

In [17]:
# Replace missing genre entries with empty strings before vectorizing.
df['genres'] = df['genres'].fillna('')

# Learn the genre vocabulary and build the (movies x terms) TF-IDF matrix.
# NOTE(review): the recorded output shape is (7, 8) although the data holds only
# 7 distinct genre labels, which is consistent with the default tokenizer
# splitting 'Sci-Fi' into the two terms 'sci' and 'fi'.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genres'])

# Printing the sparse matrix lists each stored (row, column) coordinate with its weight.
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (7, 8)>
  Coords	Values
  (0, 0)	0.5231891078946888
  (0, 7)	0.6026081468833456
  (0, 5)	0.6026081468833456
  (1, 6)	0.7694487573949885
  (1, 4)	0.6387085483562188
  (2, 0)	0.44811451141971304
  (2, 7)	0.5161373798563944
  (2, 5)	0.5161373798563944
  (2, 1)	0.5161373798563944
  (3, 2)	0.7071067811865475
  (3, 3)	0.7071067811865475
  (4, 1)	0.5172690941469574
  (4, 2)	0.60515811332262
  (4, 3)	0.60515811332262
  (5, 0)	0.44811451141971304
  (5, 7)	0.5161373798563944
  (5, 5)	0.5161373798563944
  (5, 1)	0.5161373798563944
  (6, 0)	0.5959400344623714
  (6, 4)	0.803028938037097


### Computing Cosine Similarity
* Medir a similaridade entre dois vetores
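    * Fórmula: $\cos(\theta) = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$, isto é, o cosseno do ângulo entre dois vetores. Em geral varia de -1 a 1; como os pesos TF-IDF nunca são negativos, aqui o valor fica entre 0 e 1.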
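    * Linear Kernel: calcula o produto escalar entre todos os pares de filmes. Como o `TfidfVectorizer` normaliza cada vetor para norma L2 igual a 1 (comportamento padrão), esse produto escalar é exatamente a similaridade cosseno.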

In [18]:
# Dot product of every movie vector with every other one; when called with a
# single matrix, linear_kernel compares that matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

# Sanity check: the result has one row and one column per movie.
print(cosine_sim.shape)

(7, 7)


### Get Recommendations
* Função para obter a recomendação de filmes semelhantes com base na similaridade cosseno

In [19]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``. If several rows carry the
        same title, the first one is used.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix in which row/column ``i`` corresponds to
        the i-th row of ``df``. The default is the module-level ``cosine_sim``
        as it existed when this ``def`` was executed, so re-run this cell
        after recomputing the matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values below
        zero are treated as zero.

    Returns
    -------
    tuple
        ``(titles, similarities)`` where ``titles`` is the slice of
        ``df['title']`` for the recommended movies and ``similarities`` is the
        list of their scores, both in descending order of similarity.

    Raises
    ------
    IndexError
        If ``title`` does not appear in ``df['title']``.
    """
    # Look the movie up by position, not by index label: both `cosine_sim`
    # and the `.iloc` call below are positional.
    matches = (df['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Title {title!r} not found in df['title']")
    idx = int(matches[0])

    # Similarity of the requested movie to every other movie. The movie itself
    # is excluded by position instead of assuming it sorts first: a movie with
    # an identical genre vector (e.g. 'The Avengers' and 'Avatar') ties with it
    # at the top score, and dropping "the first entry" could then discard the
    # real match and keep the query movie as its own recommendation.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep catalogue order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:max(top_n, 0)]

    movie_indices = [position for position, _ in sim_scores]
    similarities = [score for _, score in sim_scores]

    return df['title'].iloc[movie_indices], similarities

def create_recommendations_table(titles):
    """Collect the recommendations for several movies into one long-format table.

    For each entry of ``titles`` this calls ``get_recommendations`` and emits
    one output row per recommended movie.

    Parameters
    ----------
    titles : iterable of str
        Movie titles to get recommendations for.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Movie'`` (the queried title), ``'Recommended Movie'`` and
        ``'Similarity'``, with rows grouped in the order of ``titles``.
    """
    source_col, recommended_col, score_col = [], [], []

    for source_title in titles:
        rec_titles, rec_scores = get_recommendations(source_title)
        pairs = list(zip(rec_titles, rec_scores))

        # Repeat the queried title once per recommendation so the three columns stay aligned.
        source_col.extend([source_title] * len(pairs))
        recommended_col.extend(rec_title for rec_title, _ in pairs)
        score_col.extend(score for _, score in pairs)

    return pd.DataFrame({
        'Movie': source_col,
        'Recommended Movie': recommended_col,
        'Similarity': score_col,
    })

In [20]:
titles_to_recommend = ['The Matrix', 'Shrek']

# Build the DataFrame holding the recommendations for each requested title
recommendations_df = create_recommendations_table(titles_to_recommend)

# Display the recommendations DataFrame (rendered as the cell's output)
recommendations_df


Unnamed: 0,Movie,Recommended Movie,Similarity
0,The Matrix,The Avengers,0.856506
1,The Matrix,Avatar,0.856506
2,The Matrix,The Dark Knight,0.311789
3,The Matrix,Titanic,0.0
4,The Matrix,Shrek,0.0
5,Shrek,Toy Story,0.855823
6,Shrek,The Matrix,0.0
7,Shrek,Titanic,0.0
8,Shrek,The Avengers,0.0
9,Shrek,Avatar,0.0
