In [1]:
def recommend_movies(user_id, data_dir="ml-100k"):
    """Recommend movies for one MovieLens-100k user.

    Two recommenders are combined:

    1. Cluster-based filtering: users are grouped with KMeans on their
       label-encoded age group, gender and occupation; the best-rated,
       most-rated movies inside the user's cluster are returned.
    2. Association rules: pair rules ``Movie1 -> Movie2`` are mined with
       ``apyori.apriori`` from every user's 3+ rated movies, and for each
       movie the user rated 4+ the highest-support consequent is returned.

    Parameters
    ----------
    user_id : int
        Value of the ``user_id`` column (as stored in ``u.user``), not a
        positional row number.
    data_dir : str or path-like, default "ml-100k"
        Directory holding ``u.data``, ``u.item`` and ``u.user``.

    Returns
    -------
    list of str
        Movie names: cluster favourites first, then association-rule
        recommendations, duplicates removed while keeping that order.

    Raises
    ------
    KeyError
        If ``user_id`` does not occur in the user file.
    """
    import warnings

    # NOTE: this silences warnings for the whole kernel, not only this call.
    warnings.filterwarnings("ignore")

    rating_df, user_df = _load_clustered_ratings(data_dir)
    cf_result = _cluster_favourites(user_id, rating_df, user_df)
    eclat_result = _association_recommendations(user_id, rating_df)

    # Both parts are plain lists, so `+` concatenates; dict.fromkeys drops
    # repeated titles without disturbing the order.
    recommendations = list(dict.fromkeys(cf_result + eclat_result))
    return recommendations


def _load_clustered_ratings(data_dir):
    """Read the three data files and attach a KMeans user cluster and the movie name to every rating."""
    import os

    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import LabelEncoder

    # ratings: the timestamp is not used anywhere downstream
    rating_df = pd.read_csv(
        os.path.join(data_dir, "u.data"),
        names=["user_id", "movie_id", "rating", "timestamp"],
        sep="\t",
    ).drop("timestamp", axis=1)

    # movies: a literal "|" separator (no regex needed); the titles in u.item
    # are not valid UTF-8, so the file is decoded as Latin-1.
    items = pd.read_csv(
        os.path.join(data_dir, "u.item"), sep="|", header=None, encoding="latin-1"
    )
    movie = items[[0, 1]].rename(columns={0: "movie_id", 1: "movie_name"})

    # users
    user_df = pd.read_csv(
        os.path.join(data_dir, "u.user"),
        names=["user_id", "age", "gender", "occupation", "zip_code"],
        sep="|",
    ).drop("zip_code", axis=1)

    age_groups = pd.cut(
        user_df["age"],
        bins=[5, 12, 18, 24, 30, 45, 60, 100],
        labels=["Baby", "Kid", "Teen", "Adult", "Professional", "Householder", "Elderly"],
    )

    # Clustering features are chosen by name (age group instead of raw age),
    # so the result does not depend on the column order of user_df.
    features = pd.DataFrame(
        {
            # astype(str) keeps LabelEncoder working even if an age falls outside the bins
            "age_group": LabelEncoder().fit_transform(age_groups.astype(str)),
            "gender": LabelEncoder().fit_transform(user_df["gender"]),
            "occupation": LabelEncoder().fit_transform(user_df["occupation"]),
        }
    )
    k_means_optimum = KMeans(n_clusters=4, init="k-means++", random_state=42)
    user_df["Cluster"] = k_means_optimum.fit_predict(features.values)

    rating_df = pd.merge(rating_df, user_df[["user_id", "Cluster"]], on="user_id")
    rating_df = pd.merge(rating_df, movie, on="movie_id")
    return rating_df, user_df


def _cluster_favourites(user_id, rating_df, user_df):
    """Return up to 10 movie names averaging 4+ stars within the user's cluster."""
    # Look the user up by the user_id column: the row index is 0-based while
    # the ids are not, so indexing by label would read a different user.
    user_clusters = user_df.loc[user_df["user_id"] == user_id, "Cluster"]
    if user_clusters.empty:
        raise KeyError(f"user_id {user_id!r} not found in the user data")
    cluster_no = user_clusters.iloc[0]

    cluster_ratings = rating_df.loc[rating_df["Cluster"] == cluster_no]
    movie_stats = cluster_ratings.groupby("movie_name")["rating"].agg(
        avg_rating="mean", no_of_rating="count"
    )
    top_movies = (
        movie_stats.loc[movie_stats["avg_rating"] >= 4]
        .sort_values(by="no_of_rating", ascending=False)
        .head(10)  # the ten most-rated of the well-rated movies ...
        .sort_values(by="avg_rating", ascending=False)  # ... best average first
    )
    return list(top_movies.index)


def _association_recommendations(user_id, rating_df):
    """Return, for each movie the user rated 4+, the highest-support associated movie."""
    import pandas as pd
    from apyori import apriori

    # One transaction per user: the distinct movies that user rated 3 or higher.
    liked = rating_df[rating_df["rating"] >= 3]
    transactions = [
        list(names) for names in liked.groupby("user_id")["movie_name"].unique()
    ]

    # max_length=2 restricts mining to pairs, which is all that is consumed
    # below; without a cap every larger itemset would be enumerated as well.
    rules = apriori(
        transactions=transactions,
        min_support=0.003,
        min_confidence=0.2,
        min_lift=3,
        max_length=2,
    )

    rows = []
    for record in rules:
        for stat in record.ordered_statistics:
            if len(stat.items_base) == 1 and len(stat.items_add) == 1:
                rows.append(
                    (next(iter(stat.items_base)), next(iter(stat.items_add)), record.support)
                )
    rule_df = pd.DataFrame(rows, columns=["Movie1", "Movie2", "Support"])

    # Element-wise `&` (not `and`) combines the two boolean Series.
    favourite_mask = (rating_df["user_id"] == user_id) & (rating_df["rating"] >= 4)
    user_best_movies = rating_df.loc[favourite_mask, "movie_name"].unique()

    recommended_movies_list = []
    for movie in user_best_movies:
        matches = rule_df[rule_df["Movie1"] == movie]
        if matches.empty:
            continue
        best = rule_df.loc[matches["Support"].idxmax(), "Movie2"]
        if best not in recommended_movies_list:
            recommended_movies_list.append(best)
    return recommended_movies_list

# ----------------------------------------------------------------------------------------------------------------------

In [None]:
# Smoke-run the all-in-one recommender for user 2 and display the result.
ans = recommend_movies(user_id=2)
ans

In [None]:
def preprocessing(data_dir="ml-100k"):
    """Load the MovieLens-100k files, cluster the users and enrich the ratings.

    Parameters
    ----------
    data_dir : str or path-like, default "ml-100k"
        Directory holding ``u.data``, ``u.item`` and ``u.user``.

    Returns
    -------
    tuple ``(rating_df, user_df, movie_df)``
        rating_df
            Columns ``user_id, movie_id, rating, Cluster, movie_name``.
        user_df
            Columns ``user_id, age, age_group, gender, occupation, Cluster``;
            ``age_group``, ``gender`` and ``occupation`` are label-encoded
            integers. ``age`` is kept in the frame but is not a clustering
            feature.
        movie_df
            ``movie_id, movie_name, release_date`` plus the 19 genre flags.
    """
    import os
    import warnings

    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import LabelEncoder

    # NOTE: this silences warnings for the whole kernel, not only this call.
    warnings.filterwarnings("ignore")

    # ------------------------------ ratings ------------------------------
    column_names = ["user_id", "movie_id", "rating", "timestamp"]
    rating_df = pd.read_csv(os.path.join(data_dir, "u.data"), names=column_names, sep="\t")
    rating_df = rating_df.drop("timestamp", axis=1)  # timestamp is not used downstream

    # ------------------------------ movies -------------------------------
    # A literal "|" separator needs no regex (the old "\|" was an invalid
    # string escape). The titles in u.item are not valid UTF-8, so the file
    # is decoded as Latin-1.
    items = pd.read_csv(
        os.path.join(data_dir, "u.item"), sep="|", header=None, encoding="latin-1"
    )
    # keep id, title, release date and the genre flags; skip fields 3 and 4
    movie_df = items[[0, 1, 2] + list(range(5, 24))].copy()
    movie_df.columns = [
        "movie_id", "movie_name", "release_date", "unknown_genre", "Action",
        "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary",
        "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
        "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ]

    # ------------------------------- users -------------------------------
    column_names = ["user_id", "age", "gender", "occupation", "zip_code"]
    user_df = pd.read_csv(os.path.join(data_dir, "u.user"), names=column_names, sep="|")
    user_df = user_df.drop("zip_code", axis=1)

    # age -> categorical age group, placed right after the raw age column
    age_groups = pd.cut(
        user_df["age"],
        bins=[5, 12, 18, 24, 30, 45, 60, 100],
        labels=["Baby", "Kid", "Teen", "Adult", "Professional", "Householder", "Elderly"],
    )
    user_df.insert(2, "age_group", age_groups)

    # --------------------------- label encoding --------------------------
    le_gender = LabelEncoder()
    user_df["gender"] = le_gender.fit_transform(user_df["gender"])

    le_occupation = LabelEncoder()
    user_df["occupation"] = le_occupation.fit_transform(user_df["occupation"])

    le_age = LabelEncoder()
    # astype(str) keeps the encoder working even if an age falls outside the bins
    user_df["age_group"] = le_age.fit_transform(user_df["age_group"].astype(str))

    # ------------------------------ clustering ---------------------------
    # Features are picked by name (age group instead of raw age) so the
    # clustering no longer depends on the column positions of user_df.
    # NOTE(review): the encoded categories are arbitrary integers; KMeans
    # treats them as distances - confirm this is the intended modelling.
    X = user_df[["age_group", "gender", "occupation"]].values
    k_means_optimum = KMeans(n_clusters=4, init="k-means++", random_state=42)
    user_df["Cluster"] = k_means_optimum.fit_predict(X)

    # ------------------------- enrich the ratings ------------------------
    rating_df = pd.merge(rating_df, user_df[["user_id", "Cluster"]], on="user_id")
    rating_df = pd.merge(rating_df, movie_df[["movie_id", "movie_name"]], on="movie_id")

    return (rating_df, user_df, movie_df)

In [None]:
def collaborative_filtering(user_id, rating_df, user_df):
    """Return the favourite movies of the cluster that ``user_id`` belongs to.

    Among the ratings given by users of the same cluster, movies with an
    average rating of at least 4 are kept, the 10 with the most ratings are
    selected, and those are ordered by average rating (best first).

    Parameters
    ----------
    user_id : int
        Value of the ``user_id`` column, not a positional row number.
    rating_df : pandas.DataFrame
        Needs the columns ``Cluster``, ``movie_name`` and ``rating``.
    user_df : pandas.DataFrame
        Needs the columns ``user_id`` and ``Cluster``.

    Returns
    -------
    list
        Up to 10 movie names.

    Raises
    ------
    KeyError
        If ``user_id`` does not occur in ``user_df``.
    """
    # Look the user up by the user_id column. Indexing the frame by row label
    # (`.at[user_id, ...]`) reads the wrong row whenever the index does not
    # equal the ids, e.g. a default 0-based index with 1-based ids.
    user_clusters = user_df.loc[user_df['user_id'] == user_id, 'Cluster']
    if user_clusters.empty:
        raise KeyError(f"user_id {user_id!r} not found in user_df")
    cluster_no = user_clusters.iloc[0]

    cluster_ratings = rating_df.loc[rating_df['Cluster'] == cluster_no]

    # One pass gives both statistics; uses only DataFrame methods, so the
    # function does not rely on a module-level `pd` being defined.
    movie_stats = cluster_ratings.groupby('movie_name')['rating'].agg(
        avg_rating='mean', no_of_rating='count'
    )
    top_movies = (
        movie_stats.loc[movie_stats['avg_rating'] >= 4]
        .sort_values(by='no_of_rating', ascending=False)
        .head(10)
        .sort_values(by='avg_rating', ascending=False)
    )

    print("Cluster No.", cluster_no)
    return list(top_movies.index)

In [None]:
def eclat_recommendations(user_id, rating_df):
    """Recommend movies associated with the ones ``user_id`` rated highly.

    Pair rules ``Movie1 -> Movie2`` are mined with ``apyori.apriori`` from
    per-user transactions (the movies each user rated 3 or higher). For
    every movie the given user rated 4 or higher, the ``Movie2`` of the
    matching rule with the highest support is recommended.

    Parameters
    ----------
    user_id : int
        Value of the ``user_id`` column.
    rating_df : pandas.DataFrame
        Needs the columns ``user_id``, ``rating`` and ``movie_name``.

    Returns
    -------
    list
        Distinct recommended movie names, in the order of the user's
        favourites that produced them. Empty when no rule matches.
    """
    # Imported here so this cell does not depend on names that only exist
    # inside other functions of the notebook.
    import pandas as pd
    from apyori import apriori

    # One transaction per user: the distinct movies rated 3 or higher.
    # Building it from a groupby avoids the padded wide frame, whose "nan"
    # fillers and user-id column used to be mined as if they were movies.
    liked = rating_df[rating_df['rating'] >= 3]
    transactions = [
        list(names) for names in liked.groupby('user_id')['movie_name'].unique()
    ]

    # max_length=2 restricts mining to pairs, which is all that is consumed
    # below; without a cap every larger itemset would be enumerated as well.
    # NOTE(review): the former `min_length=2` keyword does not appear to be an
    # option apyori acts on - confirm against the installed version.
    rules = apriori(
        transactions=transactions,
        min_support=0.003,
        min_confidence=0.2,
        min_lift=3,
        max_length=2,
    )

    # Flatten every one-to-one rule into (antecedent, consequent, support).
    rows = []
    for record in rules:
        for stat in record.ordered_statistics:
            if len(stat.items_base) == 1 and len(stat.items_add) == 1:
                rows.append(
                    (next(iter(stat.items_base)), next(iter(stat.items_add)), record.support)
                )
    rule_df = pd.DataFrame(rows, columns=['Movie1', 'Movie2', 'Support'])

    # Element-wise `&` (not `and`) combines the two boolean Series.
    favourite_mask = (rating_df['user_id'] == user_id) & (rating_df['rating'] >= 4)
    user_best_movies = rating_df.loc[favourite_mask, 'movie_name'].unique()

    recommended_movies_list = []
    for movie in user_best_movies:
        matches = rule_df[rule_df['Movie1'] == movie]
        if matches.empty:
            continue
        # strongest rule = highest support; idxmax is deterministic on ties
        best = rule_df.loc[matches['Support'].idxmax(), 'Movie2']
        if best not in recommended_movies_list:
            recommended_movies_list.append(best)

    return recommended_movies_list

In [None]:
def recommend_movie(user_id, rating_df=None, user_df=None):
    """Combine the cluster-based and the association-rule recommendations.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    rating_df, user_df : pandas.DataFrame, optional
        The frames returned by ``preprocessing()``. When either one is
        omitted, ``preprocessing()`` is called to build them, so the
        function works on a fresh kernel instead of depending on globals
        that no cell creates. Pass them in to avoid re-running the
        preprocessing on every call.

    Returns
    -------
    list
        Cluster favourites followed by association-rule recommendations,
        duplicates removed while keeping that order.
    """
    if rating_df is None or user_df is None:
        rating_df, user_df, _ = preprocessing()

    lst1 = collaborative_filtering(user_id, rating_df, user_df)
    lst2 = eclat_recommendations(user_id, rating_df)

    # dict.fromkeys drops titles suggested by both recommenders, order kept
    lst = list(dict.fromkeys(lst1 + lst2))
    return lst

In [None]:
# Display the combined recommendations for user 2.
recommend_movie(user_id=2)