In [None]:
# Recommend movies whose rating pattern is similar to a given movie's.
# Example: for a user who rated Toy Story 5 stars, recommend movies whose
# ratings follow a pattern similar to Toy Story's.


# Mount Google Drive; the data files read later live under /content/drive.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Directory holding the three data files; change it here only.
DATA_DIR = "/content/drive/My Drive/data"

# User table: pipe-separated, column names supplied here via `names`.
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(DATA_DIR + "/u.user", sep="|", names=user_cols)

# Movie table: pipe-separated, ISO-8859-1 encoded. All 24 column names are
# still listed so that `names` lines up with the file, but only movie_id and
# title are loaded -- the remaining columns are never used afterwards.
movie_cols = ["movie_id", "title", "release date", "video release date", "IMDB URL", "unknown", "Action", "Adventure",
              "Animation", "Children's", "Comedy",
               "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
               "Sci-Fi", "Thriller", "War", "Western"]
movies = pd.read_csv(
    DATA_DIR + "/u.item",
    sep="|",
    names=movie_cols,
    encoding="ISO-8859-1",
    usecols=["movie_id", "title"],
)

# Rating table: tab-separated. The timestamp column is not needed, so it is
# skipped at read time instead of being dropped afterwards.
rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    DATA_DIR + "/u.data",
    sep="\t",
    names=rating_cols,
    usecols=["user_id", "movie_id", "rating"],
)

In [None]:
# Preview the first five rows of each table: users and movies as plain text,
# ratings through the notebook's rich display.
print(users.head(), movies.head(), sep="\n")
display(ratings.head())

   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [None]:
# Add a "#_of_ratings" column to the movies table: how many ratings each
# movie received.
# movie_id becomes the index so the per-movie counts align on assignment.
# The guard keeps this cell re-runnable: once movie_id is the index, calling
# set_index("movie_id") again would raise KeyError.
if movies.index.name != "movie_id":
    movies = movies.set_index("movie_id")
# groupby(...).count() already returns a Series indexed by movie_id, so it can
# be assigned directly; wrapping it in a DataFrame is unnecessary.
movies["#_of_ratings"] = ratings.groupby("movie_id")["rating"].count()
display(movies)

Unnamed: 0_level_0,title,#_of_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),452
2,GoldenEye (1995),131
3,Four Rooms (1995),90
4,Get Shorty (1995),209
5,Copycat (1995),86
...,...,...
1678,Mat' i syn (1997),1
1679,B. Monkey (1998),1
1680,Sliding Doors (1998),1
1681,You So Crazy (1994),1


In [None]:
# Attach the movie columns (title, #_of_ratings) to every rating row,
# matching on movie_id with the default inner join.
ratings = ratings.merge(movies, how="inner", on="movie_id")
display(ratings)

Unnamed: 0,user_id,movie_id,rating,title,#_of_ratings
0,196,242,3,Kolya (1996),117
1,63,242,3,Kolya (1996),117
2,226,242,5,Kolya (1996),117
3,154,242,3,Kolya (1996),117
4,306,242,5,Kolya (1996),117
...,...,...,...,...,...
99995,840,1674,4,Mamma Roma (1962),1
99996,655,1640,3,"Eighth Day, The (1996)",1
99997,655,1637,3,Girls Town (1996),1
99998,655,1630,3,"Silence of the Palace, The (Saimt el Qusur) (1...",1


In [None]:
# Build the full rating matrix: one row per user_id, one column per title.
# A cell is NaN when that user has no rating for that title.
rating_matrix = pd.pivot_table(
    ratings,
    values="rating",
    index="user_id",
    columns="title",
    aggfunc="mean",  # pivot_table's default aggregation, spelled out
)
rating_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",8 1/2 (1963),8 Heads in a Duffel Bag (1997),8 Seconds (1994),A Chef in Love (1996),Above the Rim (1994),Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Across the Sea of Time (1995),Addams Family Values (1993),Addicted to Love (1997),"Addiction, The (1995)","Adventures of Pinocchio, The (1996)","Adventures of Priscilla, Queen of the Desert, The (1994)","Adventures of Robin Hood, The (1938)","Affair to Remember, An (1957)","African Queen, The (1951)",Afterglow (1997),"Age of Innocence, The (1993)",Aiqing wansui (1994),Air Bud (1997),Air Force One (1997),"Air Up There, The (1994)",Airheads (1994),Akira (1988),Aladdin (1992),Aladdin and the King of Thieves (1996),Alaska (1996),Albino Alligator (1996),...,"Whole Wide World, The (1996)",Widows' Peak (1994),"Wife, The (1995)",Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wild Reeds (1994),Wild Things (1998),William Shakespeare's Romeo and Juliet (1996),Willy Wonka and the Chocolate Factory (1971),Window to Paris (1994),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Winnie the Pooh and the Blustery Day (1968),"Winter Guest, The (1997)",Wishmaster (1997),With Honors (1994),Withnail and I (1987),Witness (1985),"Wizard of Oz, The (1939)",Wolf (1994),"Woman in Question, The (1950)","Women, The (1939)","Wonderful, Horrible Life of Leni Riefenstahl, The (1993)",Wonderland (1997),"Wooden Man's Bride, The (Wu Kui) (1994)","World of Apu, The (Apur Sansar) (1959)","Wrong Trousers, The (1993)",Wyatt Earp (1994),Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, 
The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,,,2.0,5.0,,,3.0,4.0,,,,,,,,,3.0,3.0,,,,,,,,,,,,,,1.0,,,,4.0,4.0,,,,...,,,,,,,,,,4.0,,,,,,,,,,,4.0,,,,,,,,5.0,,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,,,,,,3.0,,,,,,,,,,,,,,,,,4.0,,,,,,,,...,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,
5,,,2.0,,,,,4.0,,,,,,,,,,,1.0,,2.0,,,,5.0,,,,,3.0,,,,,,,4.0,4.0,,,...,,,,,,,,,1.0,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,4.0,,,,,4.0,


In [None]:
# Toy Story's column of the matrix: one rating per user, NaN where unrated.
toystory_ratings = rating_matrix.loc[:, "Toy Story (1995)"]
print(toystory_ratings)

user_id
1      5.0
2      4.0
3      NaN
4      NaN
5      4.0
      ... 
939    NaN
940    NaN
941    5.0
942    NaN
943    NaN
Name: Toy Story (1995), Length: 943, dtype: float64


In [None]:
# Correlate every movie column with Toy Story's ratings.
# corrwith pairs each column of rating_matrix with the given Series and
# returns one coefficient per column (NaN where it cannot be computed).
# Reading the coefficient: close to 1 means the ratings move in the same
# direction, close to -1 the opposite direction, and 0 means no relationship.
corr_toystory = rating_matrix.corrwith(other=toystory_ratings, axis=0)
print(corr_toystory)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


title
'Til There Was You (1997)                0.534522
1-900 (1994)                                  NaN
101 Dalmatians (1996)                    0.232118
12 Angry Men (1957)                      0.334943
187 (1997)                               0.651857
                                           ...   
Young Guns II (1990)                     0.146312
Young Poisoner's Handbook, The (1995)   -0.026402
Zeus and Roxanne (1997)                  0.447914
unknown                                  0.440959
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64


In [None]:
# Type before the conversion (a Series coming out of corrwith).
print(type(corr_toystory))

# Convert to a one-column DataFrame named "Correlation"; the title index is kept.
corr_toystory = corr_toystory.to_frame(name="Correlation")
print(corr_toystory)

# Type after the conversion.
print(type(corr_toystory))

<class 'pandas.core.series.Series'>
                                       Correlation
title                                             
'Til There Was You (1997)                 0.534522
1-900 (1994)                                   NaN
101 Dalmatians (1996)                     0.232118
12 Angry Men (1957)                       0.334943
187 (1997)                                0.651857
...                                            ...
Young Guns II (1990)                      0.146312
Young Poisoner's Handbook, The (1995)    -0.026402
Zeus and Roxanne (1997)                   0.447914
unknown                                   0.440959
Á köldum klaka (Cold Fever) (1994)             NaN

[1664 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Attach the number of ratings to each correlation row.
# corr_toystory has one row per title, whereas movies has one row per
# movie_id, and some titles belong to more than one movie_id. Merging movies
# directly would repeat those titles and report only a per-id count, so the
# counts are summed per title first, giving exactly one row per title.
# NOTE: a user who rated two ids of the same title is counted twice in the sum.
title_counts = movies.groupby("title", as_index=False)["#_of_ratings"].sum()
corr_toystory = pd.merge(corr_toystory, title_counts, on="title")
display(corr_toystory)

Unnamed: 0,title,Correlation,#_of_ratings
0,'Til There Was You (1997),0.534522,9
1,1-900 (1994),,5
2,101 Dalmatians (1996),0.232118,109
3,12 Angry Men (1957),0.334943,125
4,187 (1997),0.651857,41
...,...,...,...
1677,Young Guns II (1990),0.146312,44
1678,"Young Poisoner's Handbook, The (1995)",-0.026402,41
1679,Zeus and Roxanne (1997),0.447914,6
1680,unknown,0.440959,9


In [None]:
# Keep only movies that have at least MIN_RATINGS ratings ("50 or more"),
# so the comparison is inclusive: a movie with exactly 50 ratings is kept.
MIN_RATINGS = 50

corr_toystory = corr_toystory[corr_toystory["#_of_ratings"] >= MIN_RATINGS]
display(corr_toystory)

Unnamed: 0,title,Correlation,#_of_ratings
2,101 Dalmatians (1996),0.232118,109
3,12 Angry Men (1957),0.334943,125
5,2 Days in the Valley (1996),0.162728,93
6,"20,000 Leagues Under the Sea (1954)",0.328472,72
7,2001: A Space Odyssey (1968),-0.069060,259
...,...,...,...
1662,"Wizard of Oz, The (1939)",0.352698,246
1663,Wolf (1994),0.303789,67
1670,"Wrong Trousers, The (1993)",0.188517,118
1675,Young Frankenstein (1974),0.239244,200


In [None]:
# Recommend the 5 movies whose ratings correlate most strongly with Toy Story.
# Toy Story itself is removed first: it correlates perfectly with itself and
# is not a useful recommendation.
# The sort is descending, so the strongest matches are at the head of the
# frame; taking the tail would return the least similar movies instead.
(
    corr_toystory[corr_toystory["title"] != "Toy Story (1995)"]
    .sort_values(by="Correlation", ascending=False)
    .head(5)
)

Unnamed: 0,title,Correlation,#_of_ratings
1502,Three Colors: Red (1994),-0.156085,83
666,Harold and Maude (1971),-0.181697,121
748,In the Company of Men (1997),-0.230604,66
484,Excess Baggage (1997),-0.243406,52
940,"Man Who Knew Too Little, The (1997)",-0.275146,52
1092,"Nosferatu (Nosferatu, eine Symphonie des Graue...",-0.307207,54
