# Theory

## Concepts
- Used to classify new data points based on their "distance" to known data
- Create clusters of data
- Simplest ML model (but still counts as supervised learning)

# Code

In [1]:
# Load the ratings data supplied with the course
import pandas as pd

# Only the first three tab-separated columns are read, under these names
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3
5,22,377,1
6,244,51,2
7,166,346,1
8,298,474,4
9,115,265,2


In [2]:
# Group the ratings by movie and compute, for each movie, the number of
# ratings (sample size) and the mean rating
import numpy as np

# The group size is requested by name so pandas uses its built-in
# aggregation rather than going through the NumPy callable np.size
ratingStats = ['size', 'mean']
movieProperties = ratings.groupby('movie_id').agg({'rating': ratingStats})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [4]:
# Build a new dataframe holding the number of ratings per movie, min-max
# normalized to the range 0..1.
# This matters for the distance computation: it keeps features measured on
# different scales from dominating the classification.
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
minNumRatings = movieNumRatings.min()
maxNumRatings = movieNumRatings.max()
movieNormalizedNumRatings = (movieNumRatings - minNumRatings) / (maxNumRatings - minNumRatings)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [5]:
# Combine the per-movie statistics computed above with the title and genre
# information read from u.item.
# Each movieDict value is a tuple:
#   (title, genre flag array, normalized number of ratings, mean rating)
movieDict = {}
with open(r'u.item', encoding="ISO-8859-1") as f:
    for line in f:
        # A blank line (e.g. a trailing newline at the end of the file) has no
        # movie id to parse, so skip it instead of failing on int('')
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields 5 onward are parsed as integer genre flags
        genres = np.array([int(flag) for flag in fields[5:25]])
        # .get('size') is used because attribute access would hit the
        # built-in .size property rather than the 'size' column
        popularity = movieNormalizedNumRatings.loc[movieID].get('size')
        meanRating = movieProperties.loc[movieID].rating.get('mean')
        movieDict[movieID] = (name, genres, popularity, meanRating)


In [7]:
# Each movie entry looks like this: title, genre flags, normalized number of ratings, mean rating
movieDict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

In [9]:
from scipy import spatial

def ComputeDistance(a, b):
    """Return a dissimilarity score between two movie records.

    The score is the sum of two scalars: the cosine distance between the
    genre vectors and the absolute difference between the popularity
    values. (A Euclidean distance over all features would be an
    alternative way to combine them.)

    Parameters
    ----------
    a, b : sequence
        Movie records; index 1 is read as the genre vector and index 2 as
        the popularity value.

    Returns
    -------
    float
        Genre distance plus popularity distance; smaller means more similar.
    """
    genresA = a[1]
    genresB = b[1]
    if any(genresA) and any(genresB):
        genreDistance = spatial.distance.cosine(genresA, genresB)
    else:
        # Cosine distance is undefined for an all-zero vector (division by a
        # zero norm yields NaN, which would break sorting by distance).
        # Treat "no genre information" as "no genre in common".
        genreDistance = 1.0
    popularityDistance = abs(a[2] - b[2])
    return genreDistance + popularityDistance
    
# Example: distance between the movies with IDs 2 and 4
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [10]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    The distance from the given movie to every other entry of the global
    ``movieDict`` is computed with ``ComputeDistance``; the K smallest are
    kept (hence the algorithm's name, K nearest neighbors).

    Parameters
    ----------
    movieID : key of ``movieDict``
        The movie whose neighbors are wanted. It is never compared with
        itself (that distance would be 0).
    K : int
        Number of neighbors to return.

    Returns
    -------
    list
        Movie IDs ordered from nearest to farthest. If fewer than K other
        movies exist, all of them are returned; K <= 0 gives an empty list.
    """
    # Looked up once: the reference movie does not change inside the loop
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movieDict.items()
        if movie != movieID
    ]
    # Sort the (movie, distance) pairs by distance
    distances.sort(key=operator.itemgetter(1))
    # Slicing (instead of indexing range(K)) tolerates K larger than the
    # number of available neighbors
    return [movie for movie, _ in distances[:max(K, 0)]]

K = 10
avgRating = 0
# Fetch the K nearest neighbors of movie 1
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    neighborEntry = movieDict[neighbor]
    neighborRating = neighborEntry[3]
    avgRating = avgRating + neighborRating
    print(neighborEntry[0] + " " + str(neighborRating))

# Turn the running total into the mean of the metric used (rating)
avgRating = avgRating / K

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


In [11]:
# Mean rating across the K nearest neighbors found above
avgRating

3.3445905900235564

In [12]:
# Entry for the query movie itself (ID 1), for comparison with its neighbors' ratings
movieDict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

## Activity

In [13]:
import operator
# Changing K only changes how many neighbors are kept
# (hence the algorithm's name, K nearest neighbors)
K = 2
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    neighborEntry = movieDict[neighbor]
    neighborRating = neighborEntry[3]
    avgRating = avgRating + neighborRating
    print(neighborEntry[0] + " " + str(neighborRating))

# Mean rating of the neighbors
avgRating = avgRating / K

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537


In [17]:
import operator
# Changing K only changes how many neighbors are kept
# (hence the algorithm's name, K nearest neighbors)
K = 3
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    neighborEntry = movieDict[neighbor]
    neighborRating = neighborEntry[3]
    avgRating = avgRating + neighborRating
    print(neighborEntry[0] + " " + str(neighborRating))

# Mean rating of the neighbors
avgRating = avgRating / K

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975


## Practice

In [15]:
import operator
# Another interesting parameter to change is the movie being queried.
# In this case the movie is Taxi Driver.
K = 5
avgRating = 0
neighbors = getNeighbors(23, K)
for neighbor in neighbors:
    neighborEntry = movieDict[neighbor]
    neighborRating = neighborEntry[3]
    avgRating = avgRating + neighborRating
    print(neighborEntry[0] + " " + str(neighborRating))

# Mean rating of the neighbors
avgRating = avgRating / K

Primal Fear (1996) 3.601123595505618
Apt Pupil (1998) 4.1
Firm, The (1993) 3.2781456953642385
Volcano (1997) 2.808219178082192
Sling Blade (1996) 4.198529411764706


In [16]:
# Mean rating across the 5 nearest neighbors of movie 23
avgRating

3.5972035761433503