In [1]:
import pandas as pd
import numpy as np

In [27]:
def revise_input_data(data):
    """Convert a user's ratings from a list of records into a rating Series.

    Parameters
    ----------
    data : list of dict (or anything ``pd.DataFrame`` accepts)
        One record per rated movie, each with at least the keys ``'MovieID'``
        and ``'Rate'``. May be empty. If the same ``MovieID`` occurs more than
        once, the first record wins.

    Returns
    -------
    pandas.Series
        Named ``'Rate'`` and labelled ``'m<MovieID>'``. It has one entry for
        every movie that is both a row label of ``Final_Files/Top_Smat.csv``
        and listed in ``proj4_data/movies.dat``, in the order of
        ``movies.dat``. Movies the user did not rate hold ``NaN``.
    """
    S = pd.read_csv('Final_Files/Top_Smat.csv', index_col=0)
    # The two-character '::' separator requires the python parsing engine.
    M = pd.read_csv('proj4_data/movies.dat', sep='::', engine='python', encoding="ISO-8859-1", header=None)
    M.columns = ['MovieID', 'Title', 'Genres']

    # Row labels of S look like 'm123'; strip the prefix to get numeric IDs.
    known_ids = [int(label[1:]) for label in S.index]
    catalog_ids = M.loc[M['MovieID'].isin(known_ids), 'MovieID']

    user_ratings = pd.DataFrame(data)
    if user_ratings.empty:
        # No ratings supplied: every movie is unrated.
        rates = np.full(len(catalog_ids), np.nan)
    else:
        # keep='first' mirrors taking the first matching record per MovieID.
        lookup = (
            user_ratings
            .drop_duplicates(subset='MovieID', keep='first')
            .set_index('MovieID')['Rate']
        )
        # One vectorised lookup; IDs without a rating come back as NaN.
        rates = catalog_ids.map(lookup).astype('float64').to_numpy()

    labels = ['m' + str(movie_id) for movie_id in catalog_ids]
    return pd.Series(rates, index=labels, name='Rate')

In [28]:
def myIBCF(w):
    """Recommend up to ten movies with item-based collaborative filtering.

    For every movie the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings over those movies that
    have both a (non-NaN) similarity to it and a rating from the user.

    Parameters
    ----------
    w : pandas.Series
        User ratings labelled ``'m<MovieID>'``; unrated movies hold ``NaN``.

    Returns
    -------
    pandas.DataFrame
        Columns ``MovieID``, ``Title`` and ``Rate`` (the predicted rating),
        sorted by ``Rate`` in descending order with a fresh 0..n-1 index.

    Notes
    -----
    Movies for which no prediction can be formed keep ``NaN``. The descending
    sort places them after all predicted movies, so they can still appear in
    the result when fewer than ten movies have a prediction.
    """
    S = pd.read_csv('Final_Files/Top_Smat.csv', index_col=0)
    # The two-character '::' separator requires the python parsing engine.
    M = pd.read_csv('proj4_data/movies.dat', sep='::', engine='python', encoding="ISO-8859-1", header=None)
    M.columns = ['MovieID', 'Title', 'Genres']

    rated = w.dropna()
    rated_movies = rated.index
    predicted_ratings = w.copy()

    for movie in S.index:
        if movie in rated_movies:
            continue
        sims = S.loc[movie]  # similarities of this movie to all others
        # Keep only neighbours that have a similarity AND a user rating.
        neighbours = sims.dropna().index.intersection(rated_movies)
        weights = sims[neighbours]
        D = np.sum(weights)
        # D == 0 also covers "no usable neighbour"; leave the entry as it was.
        if D != 0:
            predicted_ratings[movie] = np.sum(weights * rated[neighbours]) / D

    predicted_ratings = predicted_ratings.drop(rated_movies)
    res = predicted_ratings.sort_values(ascending=False).iloc[:10]
    # 'm123' -> 123 so the predictions can be joined on MovieID.
    res.index = [int(label[1:]) for label in res.index]

    final_df = M.loc[M['MovieID'].isin(res.index), ['MovieID', 'Title']].copy()
    # Align by MovieID rather than by position, so the result does not depend
    # on the row order of movies.dat.
    final_df['Rate'] = final_df['MovieID'].map(res)
    final_df = final_df.sort_values(by=['Rate'], ascending=False)
    return final_df.reset_index(drop=True)

In [32]:
def generate_output(res):
    """Format a recommendation table as a list of plain dictionaries.

    Parameters
    ----------
    res : pandas.DataFrame
        Must provide the columns ``'MovieID'`` and ``'Title'`` (unless empty).

    Returns
    -------
    list of dict
        One ``{'MovieID': ..., 'Title': ...}`` record per row of ``res``, in
        row order. NumPy scalar values are unwrapped to native Python objects
        so the records can be serialised (e.g. with ``json.dumps``).
    """
    if len(res) == 0:
        return []

    def _native(value):
        # numpy scalars (np.int64, np.str_, ...) -> built-in Python equivalents
        return value.item() if isinstance(value, np.generic) else value

    return [
        {'MovieID': _native(movie_id), 'Title': _native(title)}
        for movie_id, title in zip(res['MovieID'], res['Title'])
    ]

In [33]:
def System_2(input_data):
    """Run the full recommendation pipeline on a user's ratings.

    ``input_data`` is passed to ``revise_input_data``, the resulting rating
    Series to ``myIBCF``, and the recommendations to ``generate_output``,
    whose return value is handed back to the caller.
    """
    # Step 1: list of dictionaries -> rating Series.
    ratings_series = revise_input_data(input_data)
    # Step 2: rating Series -> recommendation table.
    top_movies = myIBCF(ratings_series)
    # Step 3: recommendation table -> list of dictionaries.
    return generate_output(top_movies)

In [35]:
# Sample input: MovieIDs 1..20 in order, each paired with its title and the
# user's rating. The comprehension expands the compact (title, rating) pairs
# into the list-of-dictionaries format that System_2 expects.
data = [
    {'MovieID': movie_id, 'Title': title, 'Rate': rate}
    for movie_id, (title, rate) in enumerate(
        [
            ('Toy Story (1995)', 4),
            ('Jumanji (1995)', 4),
            ('Grumpier Old Men (1995)', 2),
            ('Waiting to Exhale (1995)', 1),
            ('Father of the Bride Part II (1995)', 5),
            ('Heat (1995)', 1),
            ('Sabrina (1995)', 5),
            ('Tom and Huck (1995)', 4),
            ('Sudden Death (1995)', 3),
            ('GoldenEye (1995)', 4),
            ('American President, The (1995)', 1),
            ('Dracula: Dead and Loving It (1995)', 1),
            ('Balto (1995)', 3),
            ('Nixon (1995)', 3),
            ('Cutthroat Island (1995)', 4),
            ('Casino (1995)', 5),
            ('Sense and Sensibility (1995)', 3),
            ('Four Rooms (1995)', 1),
            ('Ace Ventura: When Nature Calls (1995)', 3),
            ('Money Train (1995)', 2),
        ],
        start=1,
    )
]

# Run the recommender on the sample ratings and display the result.
solution = System_2(data)
solution

[{'MovieID': 391, 'Title': "Jason's Lyric (1994)"},
 {'MovieID': 1436, 'Title': 'Falling in Love Again (1980)'},
 {'MovieID': 1596, 'Title': 'Career Girls (1997)'},
 {'MovieID': 1811, 'Title': 'Niagara, Niagara (1997)'},
 {'MovieID': 1859, 'Title': 'Taste of Cherry (1997)'},
 {'MovieID': 2063, 'Title': 'Seventh Heaven (Le Septième ciel) (1997)'},
 {'MovieID': 2172,
  'Title': 'Strike! (a.k.a. All I Wanna Do, The Hairy Bird) (1998)'},
 {'MovieID': 2534, 'Title': 'Avalanche (1978)'},
 {'MovieID': 3166, 'Title': 'Brenda Starr (1989)'},
 {'MovieID': 3282, 'Title': 'Different for Girls (1996)'}]