# Colaborative Filtering Using Pearson Correaltion

In [2]:
import numpy as np
import pandas as pd

In [3]:
anime = pd.read_csv("anime.csv")
reviews = pd.read_csv("rating.csv")

In [4]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
reviews.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
len(reviews)

7813737

In [7]:
len(anime)

12294

## Collecting Preferences

###### The method to represent multiple users and their prefeneces can be done by representing each user as a key and the anime and rating of each  as values.

In [8]:
count = len(reviews)
preferences = {}

In [9]:
#Numpy array
review_array = reviews.values

In [10]:
for i in range(count):
    user = review_array[i, 0]
    anime = review_array[i, 1]
    rating = review_array[i, 2]
    
    if user not in preferences.keys():
        preferences[user] = {}
    
    preferences[user][anime] = rating

# Similarity Scores / Pearson Scores 

###### The pearson score tells us how much two people's interests will fit on a straight line. This becomes a more sophisticated way of finding similarity

In [11]:
#similarity based on Euclidian
def sim_pearson(preferences, person1,person2):
    
    similarity = {}
    
    #getting similar anime watched
    
    for anime in preferences[person1]:
        if anime in preferences[person2]:
            similarity[anime] = 1
    
    if len(similarity) == 0:
        return 0
    
    sum1 = np.sum([preferences[person1][anime] for anime in similarity])
    sum2 = np.sum([preferences[person2][anime] for anime in similarity])
    
    sum1sq = np.sum([np.power(preferences[person1][anime],2) for anime in similarity])
    sum2sq = np.sum([np.power(preferences[person2][anime],2) for anime in similarity])
    
    pSum = np.sum([preferences[person1][anime] * preferences[person2][anime] for anime in similarity])
    
    num = pSum - (sum1 * sum2/len(similarity))
    den = np.sqrt((sum1sq - np.power(sum1,2)/len(similarity))*(sum2sq - np.power(sum2,2)/len(similarity)))
    if den == 0:
        return 0
    
    r = num/den
    
    return r

# Ranking the Similar Users 

###### Using the Pearson scores found for each reviewer with respect to user, we will find the reviewers with the top scores and rank them in descending order. 

In [12]:
#ranking the reviewers
def top_matches(preferences, person, n = 5, similarity = sim_pearson):
    
    scores = [(similarity(preferences,person,reviewer),reviewer) for reviewer in preferences if reviewer != person]
    
    scores.sort()
    scores.reverse()
    return scores[:n]

In [13]:
top_match = top_matches(preferences,1)

In [14]:
top_match

[(1.000000000000011, 35755),
 (1.000000000000011, 35508),
 (1.0000000000000084, 32904),
 (1.0000000000000084, 27924),
 (1.0000000000000084, 21376)]

# Recommending Items

###### After finding the most similar reviewers to the user,  finding the weighted scores/ratings of each reviewer for each anime provides a list of anime which the user would like the best.

In [15]:
def recommendations(preferences, person, similarity = sim_pearson):
    total = {}
    simsum = {}
    
    for reviwer in preferences:
        if reviwer == person:
            continue
        sim = similarity(preferences,person,reviwer)
        
        if sim <= 0:
            continue
        for anime in preferences[reviwer]:
            
            if anime not in preferences[person] or preferences[person][anime] == 0:
                total.setdefault(anime,0)
                total[anime] += preferences[reviwer][anime] * sim
                
                simsum.setdefault(anime,0)
                simsum[anime] += sim
                
    ranks = [(total/simsum[anime],anime) for anime,total in total.items()]
    
    ranks.sort()
    ranks.reverse()
    
    return ranks[:10],person

In [16]:
anime = pd.read_csv("anime.csv")
anime_array = anime.values

In [17]:
anime_array[0][0]

32281

In [18]:
top_rec,user = recommendations(preferences,1)

#### Using anime.csv data as a lookup table to fins the anime names using the anime_IDs obtained.

In [19]:
anime_list = []
for i in top_rec:
    for j in range(len(anime_array)):
        if i[1] == anime_array[j][0]:
            anime_list.append(anime_array[j])

#### Listing the Top anime and the likelyness of user liking them

In [22]:
print("Top Recommendations for you:\n")
for i in anime_list:
    print(i[1])

Top Recommendations for you:

Kirin Monoshiri Yakata
Midoriyama Koukou Koushien-hen
Shiroi Zou
Doukyuusei
Dededen
Konna Watashitachi ga Nariyuki de Heroine ni Natta Kekka www (TV)
Trapp Ikka Monogatari Specials
Konna Watashitachi ga Nariyuki de Heroine ni Natta Kekka www
Meitantei Holmes: Mrs. Hudson Hitojichi Jiken no Maki / Dover Kaikyou no Daikuuchuusen no Maki
Jinzou Konchuu Kabutoborg VxV
