In [1]:
import csv
import random
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
def fitness(self, individual, user, movies, degree_g, degree_d, degree_a):
    total_score = 0
    for i in individual[:5]:
        movie = movies[i]
        score_g, score_d, score_a = 0, 0, 0

        if user['preferences']['genres']:
            for n,g in enumerate(user['preferences']['genres']):
                score_g += (len(set(movie['genres']).intersection([g])))*degree_g[n]
            score_g /= sum(degree_g)
            total_score += score_g * 70/3
        if user['preferences']['director']:
            for n,d in enumerate(user['preferences']['director']):
                score_d += (len(set(movie['director']).intersection([d])))*degree_d[n]
            score_d /= sum(degree_d)
            total_score += score_d * 70/3
        if user['preferences']['actors']:
            for n,a in enumerate(user['preferences']['actors']):
                score_a += (len(set(movie['actors']).intersection([a])))*degree_a[n]
            score_a /= sum(degree_a)
            total_score += score_a * 70/3

        total_score += movie['score'] * 2
        total_score += ((movie['year']-1954)/(2022-1954)) * 10
    return total_score

In [3]:
class GA_func(object):
    def __init__(self):
        self.gene_length = 5
        self.NUM_CHROME = 20 # 染色體個數
        self.Pc = 0.9 # 交配率 (代表共執行Pc*NUM_CHROME/2次交配)
        self.Pm = 0.5 # 突變率 (代表共要執行Pm*NUM_CHROME*NUM_BIT次突變)

        self.NUM_PARENT = self.NUM_CHROME # 父母的個數
        self.NUM_CROSSOVER = int(self.Pc * self.NUM_CHROME / 2) # 交配的次數
        self.NUM_CROSSOVER_2 = self.NUM_CROSSOVER*2 # 上數的兩倍
        self.NUM_MUTATION = int(self.Pm * self.NUM_CHROME * self.gene_length) # 突變的次數

    # 創建一個初始個體
    def create_individual(self):
        genes_list = [i for i in range(self.gene_length)]
        genes = random.sample(genes_list, self.gene_length)
        return genes

    # 計算適應度
    def fitness(self, individual, user, movies, degree_g, degree_d, degree_a):
        total_score = 0
        for i in individual[:5]:
            movie = movies[i]
            score_g, score_d, score_a = 0, 0, 0
            
            if user['preferences']['genres']:
                for n,g in enumerate(user['preferences']['genres']):
                    score_g += (len(set(movie['genres']).intersection([g])))*degree_g[n]
                score_g /= sum(degree_g)
                total_score += score_g * 70/3
            if user['preferences']['director']:
                for n,d in enumerate(user['preferences']['director']):
                    score_d += (len(set(movie['director']).intersection([d])))*degree_d[n]
                score_d /= sum(degree_d)
                total_score += score_d * 70/3
            if user['preferences']['actors']:
                for n,a in enumerate(user['preferences']['actors']):
                    score_a += (len(set(movie['actors']).intersection([a])))*degree_a[n]
                score_a /= sum(degree_a)
                total_score += score_a * 70/3
            
            total_score += movie['score'] * 2
            total_score += ((movie['year']-1954)/(2022-1954)) * 10
        return total_score

    # 選擇--輪盤式選擇
    def selection(self, population, fitnesses):
        final_a = []
        for i in range(self.NUM_PARENT):
            total_fitness = sum(fitnesses)
            probabilities = [f/total_fitness for f in fitnesses]
            r = random.random()
            cumulative_probability = 0.0
            for j in range(len(population)):
                cumulative_probability += probabilities[j]
                if r <= cumulative_probability:
                    final_a.append(population[j])
                    break
        return final_a

    # 交配--均勻交配
    def crossover(self, parent):
        a = []
        for i in range(self.NUM_CROSSOVER): 
            child1, child2 = np.full((self.gene_length), -1), np.full((self.gene_length), -1)
            [j, k] = np.random.choice(self.NUM_CHROME, 2, replace=False)
            parent1, parent2 = np.array(parent[j]), np.array(parent[k])
            positions = np.random.randint(low=0, high=2, size=self.gene_length)
            child1[np.where(positions == 1)] = parent1[np.where(positions == 1)]
            child2[np.where(positions == 1)] = parent2[np.where(positions == 1)]

            for i in range(self.gene_length):
                if parent2[i] not in child1:
                    for j in range(self.gene_length):
                        if child1[j] == -1:
                            child1[j] = parent2[i]
                            break
                if parent1[i] not in child2:
                    for j in range(self.gene_length):
                        if child2[j] == -1:
                            child2[j] = parent1[i]
                            break
            a.append(list(child1))
            a.append(list(child2))
        return a

    # 突變
    def mutation(self, individual):
        for k in range(self.NUM_MUTATION):
            for i in range(self.NUM_CROSSOVER_2):
                r = np.random.randint(self.gene_length)
                individual[i][r] = random.choice(list(set(range(3649)) - set(individual[i])))
        return individual

    def sortChrome(self, a, a_fit):
        a_index = range(len(a))
        a_fit, a_index = zip(*sorted(zip(a_fit,a_index), reverse=True))
        return [a[i] for i in a_index], a_fit

    def replace(self, p, p_fit, a, a_fit): # 適者生存
        b = np.concatenate((p,a), axis=0)
        b_fit = p_fit + a_fit
        b, b_fit = self.sortChrome(b, b_fit)
        return b[:self.NUM_CHROME], list(b_fit[:self.NUM_CHROME])

ga_func = GA_func()

# 創建初始
def create_population(size):
    return [ga_func.create_individual() for _ in range(size)]
# 執行基因演算法
def genetic_algorithm(population, user, movies, degree_g, degree_d, degree_a):
    best_individual = None
    best_fitness = -1
    
    for i in range(1):
        # 計算每個個體的適應度
        fitnesses = [ga_func.fitness(individual, user, movies, degree_g, degree_d, degree_a) for individual in population]
        
        best_outputs = [] # 用此變數來紀錄每一個迴圈的最佳解 (new)
        best_outputs.append(np.max(fitnesses)) # 存下初始群體的最佳解 (new)
        
        mean_outputs = [] # 用此變數來紀錄每一個迴圈的平均解 (new)
        mean_outputs.append(np.average(fitnesses)) # 存下初始群體的最佳解 (new)
            
        # 選擇最佳個體
        index = fitnesses.index(max(fitnesses))
        if fitnesses[index] > best_fitness:
            best_individual = population[index]
            best_fitness = fitnesses[index]
        # 進行選擇、交配和突變操作，生成新的個體
#         offspring = []
        for i in tqdm(range(20)): #NUM_ITERATION
            parent = ga_func.selection(population, fitnesses)
            child = ga_func.crossover(parent)
            offspring = ga_func.mutation(child)
            
            offspring_fitnesses = [ga_func.fitness(individual, user, movies, degree_g, degree_d, degree_a) for individual in offspring]
            population, fitnesses = ga_func.replace(population, fitnesses, offspring, offspring_fitnesses)
            
            best_outputs.append(np.max(fitnesses)) # 存下這次的最佳解 (new)
            mean_outputs.append(np.average(fitnesses)) # 存下這次的平均解 (new)
            if (i+1)%10==0 or i==0:
                print('iteration %d: x = %s, y = %f' %(i, population[0][:5], fitnesses[0]))
    return best_individual, best_outputs, mean_outputs

In [4]:
def run_rs(user, degree_g, degree_d, degree_a):
    credits = pd.read_csv("netflix/credits.csv")
    titles = pd.read_csv("netflix/titles.csv")

    diff = set(titles['id']) - set(credits['id'])
    titles = titles[~titles['id'].isin(diff)]
    titles = titles[titles['type'] == 'MOVIE']
    
    # 設置基因長度，即電影數量
    movies = []
    with open('Movies.csv', mode='r',encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            desc = {}
            desc['title'] = list(row.items())[0][1]
            desc['genres'] = eval(list(row.items())[1][1])
            desc['director'] = eval(list(row.items())[2][1])
            desc['actors'] = eval(list(row.items())[3][1])
            desc['score'] = float(list(row.items())[4][1])
            desc['year'] = int(list(row.items())[5][1])
            movies.append(desc)

    gene_length = len(movies)

    NUM_CHROME = 10 # 染色體個數
    
    # 執行
    population = create_population(NUM_CHROME)
    best_individual, best_outputs, mean_outputs = genetic_algorithm(population, user, movies, degree_g, degree_d, degree_a)
    recommended_movies_id = [movies[i]['title'] for i in best_individual]
    recommended_movies = list(titles.loc[titles['id'].isin(recommended_movies_id[:5]), 'title'])

    string = 'Recommended movies are:'
    for movie in range(len(recommended_movies)):
        string += f'\n({movie + 1}): {recommended_movies[movie]}'
    return string

In [5]:
user = {'preferences': 
         {'genres': ['animation', 'european', 'crime'],
          'director': ['Marc Meyers', 'Vlad Yudin'],
          'actors': ['Natassia Malthe', 'Noriaki Sugiyama', 'Bret Eric Porter']
         }
       }
degree_g = [8, 9, 6]
degree_d = [3, 1]
degree_a = [9, 2, 9]

In [6]:
best_fitness = run_rs(user, degree_g, degree_d, degree_a)
print(best_fitness)

 10%|████████▎                                                                          | 2/20 [00:00<00:01,  9.07it/s]

iteration 0: x = [ 636 1196 1255  444 1450], y = 141.734818


 55%|█████████████████████████████████████████████                                     | 11/20 [00:01<00:00,  9.03it/s]

iteration 9: x = [2090  916 1779  910 1052], y = 148.935806


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  9.00it/s]

iteration 19: x = [2090  916 1779  910 1052], y = 148.935806
Recommended movies are:
(1): Taxi Driver
(2): Deliverance
(3): Monty Python and the Holy Grail
(4): The Dirty Dozen
(5): Life of Brian



