<a href="https://colab.research.google.com/github/yuehaoshi/Movie_Recommendation_System/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggregation

In [None]:
def prepare_dataset(data_ori):
    """Split the catalogue by type and run the feature-preparation pipeline.

    Rows whose ``type`` is ``'Movie'`` and rows whose ``type`` is
    ``'TV Show'`` are separated, the ``type`` column is removed, and each
    table is passed through the ``prepare_*`` / ``one_hot`` helpers defined
    elsewhere in the notebook.

    Args:
        data_ori: DataFrame containing at least a ``type`` column plus the
            columns the helper functions expect (not visible from here).

    Returns:
        A ``(data_movie, data_tv)`` tuple of prepared DataFrames.
    """
    # Split Movies and TV Shows.  ``drop`` returns a new frame, so nothing
    # is mutated in place on a filtered slice (avoids SettingWithCopyWarning).
    data_movie = data_ori[data_ori['type'] == 'Movie'].drop(columns='type')
    data_tv = data_ori[data_ori['type'] == 'TV Show'].drop(columns='type')
    # Country
    data_movie = prepare_country(data_movie)
    data_tv = prepare_country(data_tv)
    # Director, Country, Cast (TV shows are not one-hot encoded on director)
    data_movie = one_hot(data_movie, 'director')
    data_movie = one_hot(data_movie, 'country')
    data_movie = one_hot(data_movie, 'cast')
    data_tv = one_hot(data_tv, 'cast')
    data_tv = one_hot(data_tv, 'country')
    # Rating
    data_movie = prepare_rating(data_movie)
    data_tv = prepare_rating(data_tv)
    # Duration (second argument is True for movies, False for TV shows)
    data_tv = prepare_duration(data_tv, False)
    data_movie = prepare_duration(data_movie, True)
    # Genre (the second value returned by prepare_genre is not needed here)
    data_movie, _ = prepare_genre(data_movie)
    data_tv, _ = prepare_genre(data_tv)
    # Release_year, Date_added
    data_movie = prepare_release_add(data_movie)
    data_tv = prepare_release_add(data_tv)

    return data_movie, data_tv

In [None]:
# Build the movie and TV feature tables (data_ori must already be defined
# earlier in the notebook).
data_movie, data_tv = prepare_dataset(data_ori)

In [None]:
def dist(vec1, vec2):
    """Return the Euclidean (L2) norm of ``vec1 - vec2``.

    A cosine-similarity formulation was tried here before and is left
    disabled; only the Euclidean distance is active.
    """
    difference = vec1 - vec2
    return np.linalg.norm(difference)

def recommend(name, data_movie, data_tv, data_ori, rec_num=5):
    """Look up ``name`` and return the catalogue rows of its nearest titles.

    The title is searched in the movie table first and in the TV table
    second; the matching ``movie_rec`` / ``tv_rec`` helper is asked for
    ``rec_num + 1`` show ids (the queried title itself is at distance zero,
    so it normally takes one of those slots).

    Args:
        name: Exact title to look up.
        data_movie: Prepared movie feature table with a ``title`` column.
        data_tv: Prepared TV feature table with a ``title`` column.
        data_ori: Original catalogue; rows are selected by ``show_id``.
        rec_num: Number of recommendations wanted in addition to the
            queried title.

    Returns:
        DataFrame of ``data_ori`` rows in the order the helper ranked them.

    Raises:
        ValueError: If ``name`` is in neither table.
    """
    is_movie = data_movie["title"].isin([name])
    is_tv = data_tv["title"].isin([name])

    if is_movie.any():
        show_ids = movie_rec(data_movie, is_movie, rec_num + 1)
    elif is_tv.any():
        show_ids = tv_rec(data_tv, is_tv, rec_num + 1)
    else:
        raise ValueError("Entry is not found in the library.")

    # Collect the matching rows first and concatenate once: growing a frame
    # with pd.concat inside the loop copies the accumulated rows every time.
    frames = [data_ori[data_ori['show_id'] == show_id] for show_id in show_ids]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

from queue import PriorityQueue
def movie_rec(data_movie, condition, rec_num):
    """Return the ``show_id`` of the movies closest to the selected row.

    Distances are Euclidean (the same metric as ``dist``) over every column
    except ``show_id``, ``description`` and ``title``.  The selected row
    itself is part of the ranking (distance zero).

    Args:
        data_movie: Prepared movie table; all remaining columns must be
            convertible to float.
        condition: Boolean mask over the rows of ``data_movie`` marking the
            query row.  If several rows are marked, the first one is used.
        rec_num: Maximum number of rows to return.  Values larger than the
            table simply return every row (the previous queue-based version
            blocked forever in that case).

    Returns:
        Series of ``show_id`` values, nearest first; ties keep table order.

    Raises:
        ValueError: If ``condition`` marks no row.
    """
    features = data_movie.drop(
        columns=['show_id', 'description', 'title']).to_numpy(dtype=float)

    matches = np.flatnonzero(np.asarray(condition, dtype=bool))
    if matches.size == 0:
        raise ValueError("condition does not select any row.")
    query = features[matches[0]]

    # One vectorised pass instead of a Python loop over DataFrame rows.
    distances = np.linalg.norm(features - query, axis=1)
    # A stable sort keeps the lower row position first on equal distances,
    # matching the (distance, position) ordering used before.
    nearest = np.argsort(distances, kind='stable')[:max(rec_num, 0)]

    show_id_idx = list(data_movie.columns).index("show_id")
    return data_movie.iloc[nearest, show_id_idx]

def tv_rec(data_tv, condition, rec_num):
    """Return the ``show_id`` of the TV shows closest to the selected row.

    Distances are Euclidean (the same metric as ``dist``) over every column
    except ``director``, ``show_id``, ``description`` and ``title``.  The
    selected row itself is part of the ranking (distance zero).

    Args:
        data_tv: Prepared TV table; all remaining columns must be
            convertible to float.
        condition: Boolean mask over the rows of ``data_tv`` marking the
            query row.  If several rows are marked, the first one is used.
        rec_num: Maximum number of rows to return.  Values larger than the
            table simply return every row (the previous queue-based version
            blocked forever in that case).

    Returns:
        Series of ``show_id`` values, nearest first; ties keep table order.

    Raises:
        ValueError: If ``condition`` marks no row.
    """
    features = data_tv.drop(
        columns=['director', 'show_id', 'description', 'title']
    ).to_numpy(dtype=float)

    matches = np.flatnonzero(np.asarray(condition, dtype=bool))
    if matches.size == 0:
        raise ValueError("condition does not select any row.")
    query = features[matches[0]]

    # One vectorised pass instead of a Python loop over DataFrame rows.
    distances = np.linalg.norm(features - query, axis=1)
    # A stable sort keeps the lower row position first on equal distances,
    # matching the (distance, position) ordering used before.
    nearest = np.argsort(distances, kind='stable')[:max(rec_num, 0)]

    show_id_idx = list(data_tv.columns).index("show_id")
    return data_tv.iloc[nearest, show_id_idx]

In [None]:
# Example lookup; the returned DataFrame is displayed as the cell output.
recommend('Angamaly Diaries',data_movie,data_tv,data_ori)

In [None]:
# Second example lookup; the returned DataFrame is displayed as the cell output.
recommend('Dive Club',data_movie,data_tv,data_ori)

## Adding NLP

In [None]:
!pip install rake_nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from rake_nltk import Rake

In [None]:
import nltk
# Download the NLTK stop-word list and the 'punkt' tokenizer data.
# Presumably these are what rake_nltk's Rake needs at runtime — TODO confirm;
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab'.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
def prepare_dataset(data_ori):
    """Split the catalogue by type and run the feature-preparation pipeline.

    Rows whose ``type`` is ``'Movie'`` and rows whose ``type`` is
    ``'TV Show'`` are separated, the ``type`` column is removed, and each
    table is passed through the ``prepare_*`` / ``one_hot`` helpers defined
    elsewhere in the notebook.  The ``description`` and ``title`` columns
    are read from the result by later cells.

    Args:
        data_ori: DataFrame containing at least a ``type`` column plus the
            columns the helper functions expect (not visible from here).

    Returns:
        A ``(data_movie, data_tv)`` tuple of prepared DataFrames.
    """
    # Split Movies and TV Shows.  ``drop`` returns a new frame, so nothing
    # is mutated in place on a filtered slice (avoids SettingWithCopyWarning).
    data_movie = data_ori[data_ori['type'] == 'Movie'].drop(columns='type')
    data_tv = data_ori[data_ori['type'] == 'TV Show'].drop(columns='type')
    # Country
    data_movie = prepare_country(data_movie)
    data_tv = prepare_country(data_tv)
    # Director, Country, Cast (TV shows are not one-hot encoded on director)
    data_movie = one_hot(data_movie, 'director')
    data_movie = one_hot(data_movie, 'country')
    data_movie = one_hot(data_movie, 'cast')
    data_tv = one_hot(data_tv, 'cast')
    data_tv = one_hot(data_tv, 'country')
    # Rating
    data_movie = prepare_rating(data_movie)
    data_tv = prepare_rating(data_tv)
    # Duration (second argument is True for movies, False for TV shows)
    data_tv = prepare_duration(data_tv, False)
    data_movie = prepare_duration(data_movie, True)
    # Genre (the second value returned by prepare_genre is not needed here)
    data_movie, _ = prepare_genre(data_movie)
    data_tv, _ = prepare_genre(data_tv)
    # Release_year, Date_added
    data_movie = prepare_release_add(data_movie)
    data_tv = prepare_release_add(data_tv)

    return data_movie, data_tv

In [None]:
# Rebuild the movie and TV feature tables with the prepare_dataset defined
# in the previous cell (data_ori must already be defined).
data_movie, data_tv = prepare_dataset(data_ori)

In [None]:
# Keyword-based approach adapted from:
#https://www.kaggle.com/nishadjoshi98/visualization-analysis-and-recommendation-system
rake = Rake()
# Take explicit copies: a 'key_notes' column is added to these frames later,
# and assigning to a column slice of data_movie / data_tv would trigger
# SettingWithCopyWarning.
nlp_movie = data_movie[['title','description']].copy()
nlp_tv = data_tv[['title','description']].copy()

In [None]:
def _keywords_from(text):
    """Return the RAKE keywords of ``text`` joined into one space-separated string."""
    rake.extract_keywords_from_text(text)
    return ' '.join(rake.get_word_degrees().keys())


# Build the 'key_notes' column directly from the descriptions.  The previous
# version assigned to the row objects yielded by DataFrame.iterrows(); those
# rows may be copies, in which case the writes never reach the DataFrame and
# 'key_notes' stays empty.
nlp_movie = nlp_movie.assign(
    key_notes=nlp_movie['description'].map(_keywords_from))
nlp_tv = nlp_tv.assign(
    key_notes=nlp_tv['description'].map(_keywords_from))
recommend_movie = nlp_movie[['title','key_notes']]
recommend_tv = nlp_tv[['title','key_notes']]

# Bag-of-words counts over the keywords, then pairwise cosine similarity
# between all rows of each table (row/column order follows the table order).
cv_movie = CountVectorizer()
count_mat_movie = cv_movie.fit_transform(recommend_movie['key_notes'])
cosine_sim_movie = cosine_similarity(count_mat_movie,count_mat_movie)
cv_tv = CountVectorizer()
count_mat_tv = cv_tv.fit_transform(recommend_tv['key_notes'])
cosine_sim_tv = cosine_similarity(count_mat_tv,count_mat_tv)

In [None]:
def dist(vec1, vec2):
    """Return the Euclidean (L2) norm of ``vec1 - vec2``.

    A cosine-similarity formulation was tried here before and is left
    disabled; only the Euclidean distance is active.
    """
    difference = vec1 - vec2
    return np.linalg.norm(difference)

def recommend_new(name, data_movie, data_tv, data_ori, rec_num=5):
    """Look up ``name`` and return catalogue rows ranked by combined similarity.

    The title is searched in the movie table first and in the TV table
    second; the matching ``movie_rec_new`` / ``tv_rec_new`` helper is asked
    for ``rec_num + 1`` show ids.

    NOTE(review): the ``*_rec_new`` helpers skip the queried row, so this
    appears to return ``rec_num + 1`` recommendations rather than the query
    plus ``rec_num`` — confirm which is intended before changing the count.

    Args:
        name: Exact title to look up.
        data_movie: Prepared movie feature table with a ``title`` column.
        data_tv: Prepared TV feature table with a ``title`` column.
        data_ori: Original catalogue; rows are selected by ``show_id``.
        rec_num: Requested number of recommendations (see note above).

    Returns:
        DataFrame of ``data_ori`` rows in the order the helper ranked them.

    Raises:
        ValueError: If ``name`` is in neither table.
    """
    is_movie = data_movie["title"].isin([name])
    is_tv = data_tv["title"].isin([name])

    if is_movie.any():
        show_ids = movie_rec_new(data_movie, is_movie, rec_num + 1)
    elif is_tv.any():
        show_ids = tv_rec_new(data_tv, is_tv, rec_num + 1)
    else:
        raise ValueError("Entry is not found in the library.")

    # Collect the matching rows first and concatenate once: growing a frame
    # with pd.concat inside the loop copies the accumulated rows every time.
    frames = [data_ori[data_ori['show_id'] == show_id] for show_id in show_ids]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)


# Sanity check: the number of movie rows and the shape of the movie
# description-similarity matrix are printed for comparison.
movie_row_count = len(data_movie)
print(movie_row_count)
print(cosine_sim_movie.shape)

from queue import PriorityQueue
def movie_rec_new(data_movie, condition, rec_num, text_sim=None):
    """Rank movies by feature cosine similarity plus description similarity.

    For every row other than the query row, the score is the cosine
    similarity of the numeric feature vectors (all columns except
    ``show_id``, ``description`` and ``title``) plus the corresponding entry
    of the description-similarity matrix.

    Fixes over the previous version:
      * ranking positions are mapped back to the correct table rows (the old
        code indexed the table with positions from a list that skipped the
        query row, so every row after the query was off by one);
      * rows with an all-zero feature vector get feature similarity 0
        instead of NaN (NaN used to sort to the top);
      * debug printing removed and the builtin ``sorted`` is no longer
        shadowed.

    Args:
        data_movie: Prepared movie table; feature columns must be
            convertible to float.
        condition: Boolean mask over the rows of ``data_movie`` marking the
            query row.  If several rows are marked, the first one is used.
        rec_num: Maximum number of recommendations to return.
        text_sim: Optional square similarity matrix aligned with the rows of
            ``data_movie``.  Defaults to the notebook-level
            ``cosine_sim_movie``.

    Returns:
        Series of ``show_id`` values, most similar first; the query row is
        never included.

    Raises:
        ValueError: If ``condition`` marks no row, or if ``text_sim`` does
            not have one column per row of ``data_movie``.
    """
    if text_sim is None:
        # Fall back to the matrix computed from the descriptions earlier.
        text_sim = cosine_sim_movie

    features = data_movie.drop(
        columns=['show_id', 'description', 'title']).to_numpy(dtype=float)
    n_rows = features.shape[0]

    matches = np.flatnonzero(np.asarray(condition, dtype=bool))
    if matches.size == 0:
        raise ValueError("condition does not select any row.")
    idx = int(matches[0])

    text_row = np.asarray(text_sim, dtype=float)[idx]
    if text_row.shape[0] != n_rows:
        raise ValueError("text_sim is not aligned with the rows of data_movie.")

    # Cosine similarity of every row against the query row, vectorised.
    norms = np.linalg.norm(features, axis=1)
    dots = features @ features[idx]
    denom = norms * norms[idx]
    # Leave 0.0 wherever a norm is zero instead of dividing by zero.
    feature_sim = np.divide(dots, denom, out=np.zeros_like(dots),
                            where=denom > 0)

    combined = feature_sim + text_row

    # Rank all rows except the query itself, keeping real row positions.
    candidates = np.delete(np.arange(n_rows), idx)
    order = np.argsort(-combined[candidates], kind='stable')
    top = candidates[order][:max(rec_num, 0)]

    show_id_idx = list(data_movie.columns).index("show_id")
    return data_movie.iloc[top, show_id_idx]

def tv_rec_new(data_tv, condition, rec_num, text_sim=None):
    """Rank TV shows by feature cosine similarity plus description similarity.

    For every row other than the query row, the score is the cosine
    similarity of the numeric feature vectors (all columns except
    ``director``, ``show_id``, ``description`` and ``title``) plus the
    corresponding entry of the description-similarity matrix.

    Fixes over the previous version:
      * the whole table is searched (a leftover ``head(100)`` restricted the
        search to the first 100 rows and broke queries located beyond them);
      * ranking positions are mapped back to the correct table rows (the old
        code indexed the table with positions from a list that skipped the
        query row, so every row after the query was off by one);
      * rows with an all-zero feature vector get feature similarity 0
        instead of NaN (NaN used to sort to the top);
      * per-row debug printing removed and the builtin ``sorted`` is no
        longer shadowed.

    Args:
        data_tv: Prepared TV table; feature columns must be convertible to
            float.
        condition: Boolean mask over the rows of ``data_tv`` marking the
            query row.  If several rows are marked, the first one is used.
        rec_num: Maximum number of recommendations to return.
        text_sim: Optional square similarity matrix aligned with the rows of
            ``data_tv``.  Defaults to the notebook-level ``cosine_sim_tv``.

    Returns:
        Series of ``show_id`` values, most similar first; the query row is
        never included.

    Raises:
        ValueError: If ``condition`` marks no row, or if ``text_sim`` does
            not have one column per row of ``data_tv``.
    """
    if text_sim is None:
        # Fall back to the matrix computed from the descriptions earlier.
        text_sim = cosine_sim_tv

    features = data_tv.drop(
        columns=['director', 'show_id', 'description', 'title']
    ).to_numpy(dtype=float)
    n_rows = features.shape[0]

    matches = np.flatnonzero(np.asarray(condition, dtype=bool))
    if matches.size == 0:
        raise ValueError("condition does not select any row.")
    idx = int(matches[0])

    text_row = np.asarray(text_sim, dtype=float)[idx]
    if text_row.shape[0] != n_rows:
        raise ValueError("text_sim is not aligned with the rows of data_tv.")

    # Cosine similarity of every row against the query row, vectorised.
    norms = np.linalg.norm(features, axis=1)
    dots = features @ features[idx]
    denom = norms * norms[idx]
    # Leave 0.0 wherever a norm is zero instead of dividing by zero.
    feature_sim = np.divide(dots, denom, out=np.zeros_like(dots),
                            where=denom > 0)

    combined = feature_sim + text_row

    # Rank all rows except the query itself, keeping real row positions.
    candidates = np.delete(np.arange(n_rows), idx)
    order = np.argsort(-combined[candidates], kind='stable')
    top = candidates[order][:max(rec_num, 0)]

    show_id_idx = list(data_tv.columns).index("show_id")
    return data_tv.iloc[top, show_id_idx]

In [None]:
# Example lookup with the combined (feature + description) recommender; the
# returned DataFrame is displayed as the cell output.
recommend_new('Sankofa',data_movie,data_tv,data_ori)