In [1]:
import pandas as pd

In [2]:
class GA_func(object):
    """Genetic-algorithm operators for permutation-encoded chromosomes.

    A chromosome is a permutation of ``range(gene_length)``; each gene is used
    as an index into the ``movies`` list handed to :meth:`fitness`.

    The methods use the ``random`` and ``numpy`` (as ``np``) modules, which
    must be available as module-level names by the time they are called.
    """

    def __init__(self, gene_length=3649, num_chrome=20, pc=0.7, pm=0.1):
        """Store the GA hyper-parameters and the counts derived from them.

        Args:
            gene_length: Number of genes in one chromosome.
            num_chrome: Number of chromosomes in the population.
            pc: Crossover rate; ``pc * num_chrome / 2`` crossovers are run.
            pm: Mutation rate; ``pm * num_chrome * gene_length`` swaps are run.
        """
        self.gene_length = gene_length
        self.NUM_CHROME = num_chrome  # number of chromosomes
        self.Pc = pc  # crossover rate
        self.Pm = pm  # mutation rate

        self.NUM_PARENT = self.NUM_CHROME  # number of parents to select
        self.NUM_CROSSOVER = int(self.Pc * self.NUM_CHROME / 2)  # crossover count
        self.NUM_CROSSOVER_2 = self.NUM_CROSSOVER * 2  # offspring per generation
        # Derived from gene_length so it follows the configured chromosome size.
        self.NUM_MUTATION = int(self.Pm * self.NUM_CHROME * self.gene_length)

    def create_individual(self):
        """Return a random permutation of ``range(gene_length)`` as a list."""
        genes = list(range(self.gene_length))
        random.shuffle(genes)
        return genes

    @staticmethod
    def _preference_score(preferred, movie_values, degrees):
        """Score one preference category (genres, director or actors).

        Each preferred value found in ``movie_values`` contributes its entry
        of ``degrees``; the sum is averaged over the number of preferences and
        weighted by 7/3. Returns 0 when ``preferred`` is empty.
        """
        if not preferred:
            return 0
        available = set(movie_values)
        matched = 0
        for n, wanted in enumerate(preferred):
            matched += int(wanted in available) * degrees[n]
        return matched / len(preferred) * 7 / 3

    def fitness(self, individual, user, movies, degree_g, degree_d, degree_a):
        """Score the movie addressed by the FIRST gene of ``individual``.

        Args:
            individual: Chromosome; only ``individual[0]`` is read.
            user: Dict with ``user['preferences']`` holding the ``'genres'``,
                ``'director'`` and ``'actors'`` lists.
            movies: Sequence of dicts with ``'genres'``, ``'director'``,
                ``'actors'``, ``'score'`` and ``'year'`` entries.
            degree_g: Per-preference weights aligned with the genre list.
            degree_d: Per-preference weights aligned with the director list.
            degree_a: Per-preference weights aligned with the actor list.

        Returns:
            The three preference scores, plus ``score * 2``, plus the year
            normalised over 1954..2022 and scaled to 10.
        """
        movie = movies[individual[0]]
        preferences = user['preferences']

        total_score = 0
        total_score += self._preference_score(
            preferences['genres'], movie['genres'], degree_g)
        total_score += self._preference_score(
            preferences['director'], movie['director'], degree_d)
        total_score += self._preference_score(
            preferences['actors'], movie['actors'], degree_a)

        total_score += movie['score'] * 2
        total_score += ((movie['year'] - 1954) / (2022 - 1954)) * 10
        return total_score

    def selection(self, population, fitnesses):
        """Pick ``NUM_PARENT`` parents by roulette-wheel selection.

        Each draw takes the first individual whose cumulative selection
        probability reaches a uniform random number in [0, 1).

        Returns:
            A list of ``NUM_PARENT`` individuals (references, not copies), or
            an empty list when there is nothing to select from.
        """
        if len(population) == 0 or len(fitnesses) == 0:
            return []

        total_fitness = sum(fitnesses)
        if total_fitness == 0:
            # No individual is preferable: fall back to a uniform wheel
            # instead of dividing by zero.
            probabilities = [1.0 / len(fitnesses)] * len(fitnesses)
        else:
            probabilities = [f / total_fitness for f in fitnesses]

        # The wheel does not change between draws, so build it once.
        cumulative = []
        running = 0.0
        for probability in probabilities:
            running += probability
            cumulative.append(running)

        chosen = []
        for _ in range(self.NUM_PARENT):
            r = random.random()  # random float in [0, 1)
            # Rounding can leave the final cumulative value just below 1.0;
            # defaulting to the last slot guarantees every draw yields a parent.
            pick = len(cumulative) - 1
            for j, bound in enumerate(cumulative):
                if r <= bound:
                    pick = j
                    break
            chosen.append(population[pick])
        return chosen

    @staticmethod
    def _fill_missing(child, inherited, donor):
        """Fill the non-inherited slots of ``child`` in place.

        Walks ``donor`` in order and writes every gene not yet present in
        ``child`` into the next free slot, left to right.
        """
        present = set(child[inherited].tolist())
        open_slots = iter(np.flatnonzero(~inherited))
        for gene in donor.tolist():
            if gene in present:
                continue
            slot = next(open_slots, None)
            if slot is None:
                break
            child[slot] = gene
            present.add(gene)

    def crossover(self, parent):
        """Produce ``2 * NUM_CROSSOVER`` children by uniform crossover.

        For each crossover two distinct parents are drawn. A random 0/1 mask
        decides which positions each child copies from its own parent; the
        remaining positions are filled with the other parent's missing genes
        in that parent's order.

        Args:
            parent: Sequence of at least ``NUM_CHROME`` chromosomes.

        Returns:
            A list of children, each a list of genes.
        """
        children = []
        for _ in range(self.NUM_CROSSOVER):
            first, second = np.random.choice(self.NUM_CHROME, 2, replace=False)
            parent1, parent2 = np.array(parent[first]), np.array(parent[second])
            positions = np.random.randint(low=0, high=2, size=self.gene_length)
            inherited = positions == 1

            child1 = np.full((self.gene_length), -1)
            child2 = np.full((self.gene_length), -1)
            child1[inherited] = parent1[inherited]
            child2[inherited] = parent2[inherited]

            # Set-based repair keeps this linear in gene_length instead of
            # rescanning the whole child for every gene.
            self._fill_missing(child1, inherited, parent2)
            self._fill_missing(child2, inherited, parent1)

            children.append(list(child1))
            children.append(list(child2))
        return children

    def mutation(self, individual):
        """Apply ``NUM_MUTATION`` random gene swaps across the given offspring.

        Each swap picks one chromosome and exchanges two distinct positions.

        Args:
            individual: List of chromosomes; modified in place.

        Returns:
            The same ``individual`` list.
        """
        pool_size = len(individual)
        if pool_size == 0:
            return individual
        for _ in range(self.NUM_MUTATION):
            # Index by the actual number of offspring handed in.
            r = np.random.randint(pool_size)
            i, j = random.sample(range(self.gene_length), 2)
            individual[r][i], individual[r][j] = individual[r][j], individual[r][i]
        return individual

    def sortChrome(self, a, a_fit):
        """Sort chromosomes by fitness, best first.

        Ties are broken by the larger original index first.

        Returns:
            ``(sorted_chromosomes, sorted_fitnesses)`` where the fitnesses are
            a tuple; ``([], ())`` for empty input.
        """
        if len(a) == 0:
            return [], ()
        order = sorted(range(len(a)), key=lambda i: (a_fit[i], i), reverse=True)
        return [a[i] for i in order], tuple(a_fit[i] for i in order)

    def replace(self, p, p_fit, a, a_fit):
        """Merge parents and offspring and keep the ``NUM_CHROME`` fittest.

        Args:
            p: Current population.
            p_fit: Fitness values of ``p``.
            a: Offspring.
            a_fit: Fitness values of ``a``.

        Returns:
            ``(population, fitnesses)``: the survivors, best first, and their
            fitness values as a list.
        """
        b = np.concatenate((p, a), axis=0)
        # list() keeps this a concatenation even for array-like fitness inputs.
        b_fit = list(p_fit) + list(a_fit)
        b, b_fit = self.sortChrome(b, b_fit)
        return b[:self.NUM_CHROME], list(b_fit[:self.NUM_CHROME])

# Shared GA helper instance used by create_population and genetic_algorithm.
ga_func = GA_func()

# Build the initial population
def create_population(size):
    """Return ``size`` chromosomes, each from ``ga_func.create_individual()``."""
    population = []
    for _ in range(size):
        population.append(ga_func.create_individual())
    return population
# Run the genetic algorithm
def genetic_algorithm(population, user, movies, degree_g, degree_d, degree_a,
                      num_iterations=1):
    """Evolve ``population`` and return the fittest individual seen.

    Args:
        population: Initial list of chromosomes.
        user: User description passed through to ``ga_func.fitness``.
        movies: Movie list passed through to ``ga_func.fitness``.
        degree_g: Genre weights passed through to ``ga_func.fitness``.
        degree_d: Director weights passed through to ``ga_func.fitness``.
        degree_a: Actor weights passed through to ``ga_func.fitness``.
        num_iterations: Number of generations to run (default 1).

    Returns:
        ``(best_individual, best_outputs, mean_outputs)``: the fittest
        individual over the initial population and every generation, followed
        by the best and the mean fitness recorded for the initial population
        and after each generation.
    """
    def evaluate(individuals):
        return [
            ga_func.fitness(individual, user, movies, degree_g, degree_d, degree_a)
            for individual in individuals
        ]

    # Score the initial population and record its statistics.
    fitnesses = evaluate(population)
    best_outputs = [np.max(fitnesses)]
    mean_outputs = [np.average(fitnesses)]

    # Start from the best initial individual; no numeric sentinel is used, so
    # a result is returned whatever the sign of the fitness values.
    index = fitnesses.index(max(fitnesses))
    best_individual = population[index]
    best_fitness = fitnesses[index]

    # Selection, crossover and mutation produce the offspring; survivors are
    # picked from parents and offspring together.
    for generation in range(num_iterations):
        parent = ga_func.selection(population, fitnesses)
        child = ga_func.crossover(parent)
        offspring = ga_func.mutation(child)

        offspring_fitnesses = evaluate(offspring)
        population, fitnesses = ga_func.replace(
            population, fitnesses, offspring, offspring_fitnesses)

        best_outputs.append(np.max(fitnesses))
        mean_outputs.append(np.average(fitnesses))

        # Let the evolved population compete for the returned best individual;
        # the strict comparison keeps the earlier one on ties.
        index = fitnesses.index(max(fitnesses))
        if fitnesses[index] > best_fitness:
            # Convert to a plain list so the return type does not depend on
            # whether the winner came out of replace().
            best_individual = np.asarray(population[index]).tolist()
            best_fitness = fitnesses[index]

        if (generation + 1) % 10 == 0 or generation == 0:
            print('iteration %d: x = %s, y = %f' % (generation, population[0][:5], fitnesses[0]))

    return best_individual, best_outputs, mean_outputs

In [3]:
import csv
import random
import pandas as pd
from tqdm import tqdm
import numpy as np

def run_rs(user, degree_g, degree_d, degree_a):
    """Load the movie catalogue and run the GA recommender for one user.

    Args:
        user: User description passed through to ``genetic_algorithm``.
        degree_g: Genre weights passed through to ``genetic_algorithm``.
        degree_d: Director weights passed through to ``genetic_algorithm``.
        degree_a: Actor weights passed through to ``genetic_algorithm``.

    Returns:
        ``(movies, recommended_title)``: the list of movie dicts read from
        ``Movies.csv`` and the title of the movie addressed by the first gene
        of the best individual.
    """
    # NOTE(review): eval() executes whatever expression a CSV cell contains.
    # If Movies.csv can come from an untrusted source, switch to
    # ast.literal_eval after confirming every cell is a plain literal.
    movies = []
    with open('Movies.csv', mode='r', encoding="utf-8") as file:
        for row in csv.DictReader(file):
            # Columns are addressed by position, not by header name.
            values = list(row.values())
            movies.append({
                'title': values[0],
                'genres': eval(values[1]),
                'director': eval(values[2]),
                'actors': eval(values[3]),
                'score': float(values[4]),
                'year': int(values[5]),
            })

    # NOTE(review): the chromosome length comes from ga_func, not from
    # len(movies) -- presumably the two are expected to match; confirm.
    NUM_CHROME = 20  # number of chromosomes

    population = create_population(NUM_CHROME)
    best_individual, best_outputs, mean_outputs = genetic_algorithm(
        population, user, movies, degree_g, degree_d, degree_a)
    # Only the first gene of the best individual is recommended.
    recommended_movies_id = movies[best_individual[0]]['title']

    return movies, recommended_movies_id

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
def content_recommender(movies_data, title):
    """Return the titles of the five movies most similar to ``title``.

    Similarity is the cosine similarity between one-hot encodings of each
    movie's directors, actors and genres.

    Args:
        movies_data: DataFrame with ``'title'``, ``'genres'``, ``'director'``
            and ``'actors'`` columns.
        title: Title of the query movie. When several rows share the title,
            the first one is used.

    Returns:
        A list of up to five titles, most similar first; the query row itself
        is never included.

    Raises:
        KeyError: If ``title`` does not occur in ``movies_data['title']``.
    """
    def encode(column):
        # Lists are joined into one ', '-separated string so that
        # str.get_dummies can split them into indicator columns.
        joined = movies_data[column].apply(
            lambda x: ', '.join(x) if isinstance(x, list) else x)
        return joined.str.get_dummies(', ')

    genres_encoded = encode('genres')
    director_encoded = encode('director')
    actors_encoded = encode('actors')
    features = pd.concat([director_encoded, actors_encoded, genres_encoded], axis=1)

    # Work with the row POSITION: the similarity row and .iloc below are both
    # positional, so index labels must not be used here.
    titles = movies_data['title'].tolist()
    try:
        idx = titles.index(title)
    except ValueError:
        raise KeyError(title) from None

    # Only the query row is needed, not the full pairwise matrix.
    similarity = cosine_similarity(features.iloc[[idx]], features)[0]

    # Exclude the query by position instead of assuming it ranks first, which
    # fails on ties. The stable sort keeps the original order on ties.
    candidates = [pos for pos in range(len(similarity)) if pos != idx]
    candidates.sort(key=lambda pos: similarity[pos], reverse=True)
    movie_indices = candidates[:5]

    return list(movies_data['title'].iloc[movie_indices].values)

In [5]:
# Load the experiment table; the first CSV column is used as the index.
data = pd.read_csv('exp_data.csv', index_col = 0)
# Notebook display of the table's (rows, columns).
data.shape

(50, 4)

In [None]:
# Load the movie catalogue once for the content-based recommender.
# NOTE(review): eval() executes whatever expression a CSV cell contains, both
# for Movies.csv here and for the exp_data columns below. If either file can
# come from an untrusted source, switch to ast.literal_eval after confirming
# every cell is a plain literal.
movies = []
with open('Movies.csv', mode='r', encoding='UTF-8') as file:
    for row in csv.DictReader(file):
        # Columns are addressed by position, not by header name.
        values = list(row.values())
        movies.append({
            'title': values[0],
            'genres': eval(values[1]),
            'director': eval(values[2]),
            'actors': eval(values[3]),
            'score': float(values[4]),
            'year': int(values[5]),
        })
movies_data = pd.DataFrame(movies)
# Title -> position in `movies`; for duplicate titles the last position wins.
title2ID = {movie['title']: i for i, movie in enumerate(movies)}

# For every user: run the GA recommender, expand its pick to the five most
# similar movies, and sum their fitness scores.
result = []
for userID in tqdm(range(len(data))):
    record = data.iloc[userID]
    user = eval(record['User'])
    degree_g = eval(record['degree_g'])
    degree_d = eval(record['degree_d'])
    degree_a = eval(record['degree_a'])

    movies, recommended_movies = run_rs(user, degree_g, degree_d, degree_a)
    top5_rm = content_recommender(movies_data, recommended_movies)

    top5_rm_ID = [title2ID[movie_title] for movie_title in top5_rm]
    # ga_func.fitness scores the movie addressed by the first gene, so a
    # one-gene chromosome reuses the GA's scoring formula for each movie.
    total_score = sum(
        ga_func.fitness([movie_id], user, movies, degree_g, degree_d, degree_a)
        for movie_id in top5_rm_ID
    )
    result.append(total_score)
# Mean of the per-user top-5 totals.
print(np.mean(result))