# 데이터 로드

https://grouplens.org/datasets/movielens/

https://files.grouplens.org/datasets/movielens/ml-10m-README.html

## 전처리 한눈에 보기
 ![nn](img/data.png)

In [116]:
MOVIE_DATA_PATH = "data/movies.csv"
RATING_DATA_PATH = "data/ratings.csv"
TAG_DATA_PATH = "data/tags.csv"

In [117]:
import pandas as pd

movies_df = pd.read_csv(MOVIE_DATA_PATH)
ratings_df = pd.read_csv(RATING_DATA_PATH)
tags_df = pd.read_csv(TAG_DATA_PATH)

In [118]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [119]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [120]:
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


# 전처리

## 컬럼명 변경

In [121]:
# movies_df = movies_df.rename(columns={'movieId': 'movie_id'}, inplace=True)
movies_df = movies_df.rename(columns={'movieId': 'movie_id'})
ratings_df = ratings_df.rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})
tags_df = tags_df.rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})

In [122]:
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [123]:
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [124]:
tags_df

Unnamed: 0,user_id,movie_id,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


## ID 형변환

In [125]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   int64 
 1   title     9742 non-null   object
 2   genres    9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [126]:
movies_df['movie_id'] = movies_df['movie_id'].astype(str)
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   object
 1   title     9742 non-null   object
 2   genres    9742 non-null   object
dtypes: object(3)
memory usage: 228.5+ KB


In [127]:
ratings_df['user_id'] = ratings_df['user_id'].astype(str)
ratings_df['movie_id'] = ratings_df['movie_id'].astype(str)
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100836 non-null  object 
 1   movie_id   100836 non-null  object 
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


In [128]:
tags_df['user_id'] = tags_df['user_id'].astype(str)
tags_df['movie_id'] = tags_df['movie_id'].astype(str)
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    3683 non-null   object
 1   movie_id   3683 non-null   object
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 115.2+ KB


## 중복 처리

### 중복체크

In [129]:
# === 장르 / 태그 존재 여부 플래그 만들기 ===

# 1) 장르가 있는지 여부
#  - NaN 이거나 빈 문자열이거나 "(no genres listed)" 인 경우 → 장르 없음으로 처리
movies_df["has_genre"] = (
    movies_df["genres"].notna()
    & (movies_df["genres"].str.strip() != "")
    & (movies_df["genres"] != "(no genres listed)")
)

# 2) 태그가 있는지 여부
#  - tags_df 에서 한 번이라도 tag가 달린 적 있으면 그 영화는 "태그 있음"
tagged_movie_ids = tags_df["movie_id"].unique()
movies_df["has_tag"] = movies_df["movie_id"].isin(tagged_movie_ids)

# === 개수 집계 ===
total_movies = len(movies_df)
no_genre_count = (~movies_df["has_genre"]).sum()
no_tag_count = (~movies_df["has_tag"]).sum()
no_genre_no_tag_count = (~movies_df["has_genre"] & ~movies_df["has_tag"]).sum()

print(f"전체 영화 개수                : {total_movies}")
print(f"장르가 없는 영화 개수         : {no_genre_count}")
print(f"태그가 없는 영화 개수         : {no_tag_count}")
print(f"장르와 태그 모두 없는 영화 개수: {no_genre_no_tag_count}")


전체 영화 개수                : 9742
장르가 없는 영화 개수         : 34
태그가 없는 영화 개수         : 8170
장르와 태그 모두 없는 영화 개수: 33


In [130]:
len(movies_df)  # 무비 데이터 수

9742

In [131]:
movies_df['movie_id'].nunique()  # 무비 id 수

9742

In [132]:
movies_df['title'].nunique()  # 무비 제목 != 무비 id 수
# 미스매치 일어나는 것 해결하고 가야 함

9737

In [133]:
# 제목 중복된 영화
movies_df[movies_df.duplicated('title')]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
5601,26958,Emma (1996),Romance,True,False
6932,64997,War of the Worlds (2005),Action|Sci-Fi,True,False
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller,True,False
9135,147002,Eros (2004),Drama|Romance,True,False
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller,True,False


In [134]:
# 중복 개수까지 체크
movies_df['title'].value_counts()[movies_df['title'].value_counts() > 1]

title
Emma (1996)                               2
War of the Worlds (2005)                  2
Confessions of a Dangerous Mind (2002)    2
Eros (2004)                               2
Saturn 3 (1980)                           2
Name: count, dtype: int64

### 중복값 처리

영화1) Emma

movies 데이터 처리

In [135]:
movie_titile = "Emma (1996)"
movies_df[movies_df['title'] == movie_titile]
#같은 emma인데 장르가 다르게 -> id 달라짐 -> 장르를 merge 해주면 됨

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
650,838,Emma (1996),Comedy|Drama|Romance,True,True
5601,26958,Emma (1996),Romance,True,False


In [136]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
unique_id

'838'

In [137]:
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]
duplicated_id

'26958'

In [138]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


ratings 데이터 처리

In [139]:
ratings_df[ratings_df['movie_id'] == duplicated_id]
#우리가 작업한 거에서는 26958은 지워진 아이디. 통합한 유니크 아이디로 레퍼런스 찾아갈 수 있게 해줘야 함.

Unnamed: 0,user_id,movie_id,rating,timestamp
80596,509,26958,3.5,1436031753


In [140]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([80596], dtype='int64')

In [141]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [142]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [143]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
80596,509,838,3.5,1436031753


영화2) War of the Worlds

movies 데이터 처리

In [144]:
movie_titile = "War of the Worlds (2005)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller,True,False
6932,64997,War of the Worlds (2005),Action|Sci-Fi,True,False


In [145]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
unique_id

'34048'

In [146]:
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]
duplicated_id

'64997'

In [147]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


ratings 데이터 처리

In [148]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
4747,28,64997,3.5,1234850075
11451,68,64997,2.5,1230497715


In [149]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([4747, 11451], dtype='int64')

In [150]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [151]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [152]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
4747,28,34048,3.5,1234850075
11451,68,34048,2.5,1230497715


영화3) Confessions of a Dangerous Mind

movies 데이터 처리

In [153]:
movie_titile = "Confessions of a Dangerous Mind (2002)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller,True,True
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller,True,False


In [154]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [155]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


ratings 데이터 처리

In [156]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
17819,111,144606,4.0,1517441257


In [157]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([17819], dtype='int64')

In [158]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [159]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [160]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
17819,111,6003,4.0,1517441257


영화4) Eros

movies 데이터 처리

In [161]:
movie_titile = "Eros (2004)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
5854,32600,Eros (2004),Drama,True,False
9135,147002,Eros (2004),Drama|Romance,True,False


In [162]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [163]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


ratings 데이터 처리

In [164]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
49832,318,147002,4.0,1502207152


In [165]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([49832], dtype='int64')

In [166]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [167]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [168]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
49832,318,32600,4.0,1502207152


영화5) Saturn 3

movie 데이터 처리

In [169]:
movie_titile = "Saturn 3 (1980)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller,True,False
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller,True,False


In [170]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [171]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


ratings 데이터 처리

In [172]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
81458,514,168358,2.5,1533945970


In [173]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([81458], dtype='int64')

In [174]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [175]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [176]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
81458,514,2851,2.5,1533945970


### 중복값 처리 확인

In [177]:
len(movies_df)

9737

In [178]:
movies_df['movie_id'].nunique()

9737

In [179]:
movies_df['title'].nunique()

9737

## 장르 결측치 처리

Imputation

In [180]:
no_genre_cond = movies_df['genres'].str.contains(' ')
movies_df[no_genre_cond]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
8517,114335,La cravate (1957),(no genres listed),False,False
8684,122888,Ben-hur (2016),(no genres listed),False,False
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed),False,False
8782,129250,Superfast! (2015),(no genres listed),False,False
8836,132084,Let It Be Me (1995),(no genres listed),False,False
8902,134861,Trevor Noah: African American (2013),(no genres listed),False,False
9033,141131,Guardians (2016),(no genres listed),False,False
9053,141866,Green Room (2015),(no genres listed),False,False
9070,142456,The Brand New Testament (2015),(no genres listed),False,False
9091,143410,Hyena Road,(no genres listed),False,False


In [181]:
len(movies_df[no_genre_cond])

34

In [182]:
len(movies_df[~no_genre_cond])

9703

In [183]:
# 장르 없으면 사용안할 것
del_movie_ids = movies_df.loc[no_genre_cond, 'movie_id'].tolist()
len(del_movie_ids)

34

In [184]:
movies_df = movies_df[~movies_df['movie_id'].isin(del_movie_ids)].copy()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


In [185]:
# 검산: 삭제 id 데이터는 없어야 한
movies_df[no_genre_cond]

  movies_df[no_genre_cond]


Unnamed: 0,movie_id,title,genres,has_genre,has_tag


ratings 데이터도 처리 필요

In [186]:
len(ratings_df)

100836

In [187]:
# 삭제할 영화 관련 데이터
len(ratings_df[ratings_df['movie_id'].isin(del_movie_ids)])

47

In [188]:
100836 - 47  # 삭제 후 데이터 개수

100789

In [189]:
ratings_df = ratings_df[~ratings_df['movie_id'].isin(del_movie_ids)].copy()
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


## (추가된 부분) ★  복수 평점 처리  ★ 
한 유저가 한 영화에 대해 여러번 평점을 남겼을 때 최근 평점을 선택한다

In [190]:
from IPython.display import display

# 중복 제거 전 상태 보관
ratings_before = ratings_df.copy()

# >> STEP 1 << (user_id, movie_id) 별로 평점이 몇 번 남았는지 집계
pair_counts = (
    ratings_df
    .groupby(["user_id", "movie_id"])
    .size()
    .reset_index(name="n_ratings")
)

# 같은 영화에 2번 이상 평점을 남긴 경우만 필터링
multi_rated_pairs = pair_counts[pair_counts["n_ratings"] > 1]

print(f"한 영화에 여러 번 평점을 남긴 (user_id, movie_id) 쌍 개수: {len(multi_rated_pairs)}")
print("\n=== 여러 번 평점을 남긴 쌍 Top 10 ===")
display(
    multi_rated_pairs
    .sort_values("n_ratings", ascending=False)
    .head(10)
)

한 영화에 여러 번 평점을 남긴 (user_id, movie_id) 쌍 개수: 4

=== 여러 번 평점을 남긴 쌍 Top 10 ===


Unnamed: 0,user_id,movie_id,n_ratings
2847,111,6003,2
29460,28,34048,2
72313,509,838,2
96090,68,34048,2


In [191]:
# >> STEP 2 << 샘플 몇 개 골라서 중복 제거 전 상태 확인
if len(multi_rated_pairs) > 0:
    sample_pairs = multi_rated_pairs.sample(
        min(5, len(multi_rated_pairs)),
        random_state=42,
    )

    print("\n=== 중복 제거 전: 같은 영화에 여러 번 평점을 남긴 예시 ===")
    before_sample = (
        ratings_df
        .merge(sample_pairs[["user_id", "movie_id"]], on=["user_id", "movie_id"], how="inner")
        .sort_values(["user_id", "movie_id", "timestamp"])
    )

    display(
        before_sample[["user_id", "movie_id", "rating", "timestamp"]]
    )
else:
    print("\n여러 번 평점을 남긴 (user_id, movie_id) 쌍이 없습니다.")


=== 중복 제거 전: 같은 영화에 여러 번 평점을 남긴 예시 ===


Unnamed: 0,user_id,movie_id,rating,timestamp
4,111,6003,4.0,1516468531
5,111,6003,4.0,1517441257
0,28,34048,3.5,1234516420
1,28,34048,3.5,1234850075
6,509,838,3.5,1436031723
7,509,838,3.5,1436031753
2,68,34048,2.0,1158532246
3,68,34048,2.5,1230497715


In [192]:
# >> STEP 3 << 가장 최신 timestamp의 평점만 남기기
ratings_df = (
    ratings_df
    .sort_values("timestamp")  # 오래된 → 최신 순으로 정렬
    .drop_duplicates(subset=["user_id", "movie_id"], keep="last")
    .reset_index(drop=True)
)

print("\n=== 중복 제거 전/후 row 수 비교 ===")
print(f"- 제거 전: {len(ratings_before)} rows")
print(f"- 제거 후: {len(ratings_df)} rows")


=== 중복 제거 전/후 row 수 비교 ===
- 제거 전: 100789 rows
- 제거 후: 100785 rows


In [193]:
# >> STEP 4 << 같은 쌍에 대해 중복 제거 후 어떤 평점이 남았는지 확인
if len(multi_rated_pairs) > 0:
    print("\n=== 중복 제거 후: 같은 (user_id, movie_id) 쌍에 대해 최종 남은 평점 ===")
    after_sample = (
        ratings_df
        .merge(sample_pairs[["user_id", "movie_id"]], on=["user_id", "movie_id"], how="inner")
        .sort_values(["user_id", "movie_id"])
    )

    display(
        after_sample[["user_id", "movie_id", "rating", "timestamp"]]
    )


=== 중복 제거 후: 같은 (user_id, movie_id) 쌍에 대해 최종 남은 평점 ===


Unnamed: 0,user_id,movie_id,rating,timestamp
3,111,6003,4.0,1517441257
1,28,34048,3.5,1234850075
2,509,838,3.5,1436031753
0,68,34048,2.5,1230497715


## 그외 결측치 체크

In [194]:
movies_df.columns[movies_df.isna().any()].tolist()

[]

## 인덱스 정리

In [195]:
# 중복 제거 작업으로 인덱스 불일치 상태
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9739,193585,Flint (2017),Drama,True,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


In [196]:
movies_df = movies_df.reset_index(drop=True)
movies_df
# 중요!! 삭제 과정 후에는 항상 reset_index를 해야 한다.(pandas에서 했다면 반드시)

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,True,True
1,2,Jumanji (1995),Adventure|Children|Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy|Romance,True,True
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,True,False
9699,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,True,False
9700,193585,Flint (2017),Drama,True,False
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,True,False


In [197]:
# 중복 제거 작업으로 인덱스 불일치 상태
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


In [198]:
ratings_df = ratings_df.reset_index(drop=True)
ratings_df
# 중요!! 삭제 과정 후에는 항상 reset_index를 해야 한다.(pandas에서 했다면 반드시)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


## 구분자 변경

TfidfVectorizer 사용하려면 space 를 seperator 로 써야

In [199]:
# 장르 이름에는 빈칸 없다
# 만약 빈칸 있으면 _ 붙이는 처리 필요
# 예) k drama|fun --> k drama fun  (X)
#                 --> k_drama fun  (O)
movies_df[movies_df['genres'].str.contains(' ')]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag


In [200]:
movies_df['genres']

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9698                Action|Animation|Comedy|Fantasy
9699                       Animation|Comedy|Fantasy
9700                                          Drama
9701                               Action|Animation
9702                                         Comedy
Name: genres, Length: 9703, dtype: object

In [201]:
movies_df['genres'].str.replace('\|',' ', regex=True)

0       Adventure Animation Children Comedy Fantasy
1                        Adventure Children Fantasy
2                                    Comedy Romance
3                              Comedy Drama Romance
4                                            Comedy
                           ...                     
9698                Action Animation Comedy Fantasy
9699                       Animation Comedy Fantasy
9700                                          Drama
9701                               Action Animation
9702                                         Comedy
Name: genres, Length: 9703, dtype: object

In [202]:
movies_df['genres'] = movies_df['genres'].str.replace(r'\|',' ', regex=True)
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,True,True
1,2,Jumanji (1995),Adventure Children Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy Romance,True,True
3,4,Waiting to Exhale (1995),Comedy Drama Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,True,False
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,True,False
9700,193585,Flint (2017),Drama,True,False
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,True,False


## 연도 결측치 처리

In [203]:
# 제목에 연도 없는 데이터 검색
# movies_df[~movies_df['title'].str.contains('(')]  <-- \( 사용해야함
movies_df[~movies_df['title'].str.contains(r'\(')]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
6058,40697,Babylon 5,Sci-Fi,True,False
9023,140956,Ready Player One,Action Sci-Fi Thriller,True,False
9163,149334,Nocturnal Animals,Drama Thriller,True,False
9345,162414,Moonlight,Drama,True,False


웹 검색으로 영화 연도 찾은 결과
```
'Babylon 5': 1994
'Ready Player One': 2018
'Nocturnal Animals': 2017
'Moonlight': 2009
```

In [204]:
tmp = movies_df.copy()

In [205]:
tmp.loc[6058, 'title'] = tmp.loc[6058, 'title'] + " (1994)"
tmp.loc[[6058]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
6058,40697,Babylon 5 (1994),Sci-Fi,True,False


In [206]:
# 중복 실행 시 연도 중첩 방지 필요
tmp.loc[6058, 'title'] = tmp.loc[6058, 'title'] + " (1994)"
tmp.loc[[6058]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
6058,40697,Babylon 5 (1994) (1994),Sci-Fi,True,False


In [207]:
# 연도 없는지 체크
'(' not in tmp.loc[6058, 'title']

False

In [208]:
tmp = movies_df.copy()
idx = 6058
year = 1994

if '(' not in tmp.loc[idx, 'title']:
    tmp.loc[idx, 'title'] = tmp.loc[idx, 'title'] + f" ({year})"

tmp.loc[[idx]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
6058,40697,Babylon 5 (1994),Sci-Fi,True,False


In [209]:
# 직접 입력
idx = 6058
year = 1994
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
6058,40697,Babylon 5 (1994),Sci-Fi,True,False


In [210]:
# 직접 입력
idx = 9023
year = 2018
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
9023,140956,Ready Player One (2018),Action Sci-Fi Thriller,True,False


In [211]:
# 직접 입력
idx = 9163
year = 2017
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
9163,149334,Nocturnal Animals (2017),Drama Thriller,True,False


In [212]:
# 직접 입력
idx = 9345
year = 2009
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
9345,162414,Moonlight (2009),Drama,True,False


In [213]:
# 검산: 제목에 연도 없는 데이터 검색
movies_df[~movies_df['title'].str.contains(r'\(')]

Unnamed: 0,movie_id,title,genres,has_genre,has_tag


## 영화명/연도 분리

### 연습

In [214]:
"Waiting to Exhale (1995)".rsplit(' ', 1)

['Waiting to Exhale', '(1995)']

In [215]:
# 여백 주의! split에 영향줌
"Waiting to Exhale (1995) ".rsplit(' ', 1)

['Waiting to Exhale (1995)', '']

In [216]:
"Waiting to Exhale (1995) ".strip()

'Waiting to Exhale (1995)'

In [217]:
"Waiting to Exhale (1995) ".strip().rsplit(' ', 1)

['Waiting to Exhale', '(1995)']

In [218]:
movies_df['title'].str.rsplit(' ', n=1)

0                                [Toy Story, (1995)]
1                                  [Jumanji, (1995)]
2                         [Grumpier Old Men, (1995)]
3                        [Waiting to Exhale, (1995)]
4              [Father of the Bride Part II, (1995)]
                            ...                     
9698    [Black Butler: Book of the Atlantic, (2017)]
9699                 [No Game No Life: Zero, (2017)]
9700                                 [Flint, (2017)]
9701          [Bungo Stray Dogs: Dead Apple, (2018)]
9702          [Andrew Dice Clay: Dice Rules, (1991)]
Name: title, Length: 9703, dtype: object

In [219]:
movies_df['title'].str.rsplit(' ', n=1, expand=True)

Unnamed: 0,0,1
0,Toy Story,(1995)
1,Jumanji,(1995)
2,Grumpier Old Men,(1995)
3,Waiting to Exhale,(1995)
4,Father of the Bride Part II,(1995)
...,...,...
9698,Black Butler: Book of the Atlantic,(2017)
9699,No Game No Life: Zero,(2017)
9700,Flint,(2017)
9701,Bungo Stray Dogs: Dead Apple,(2018)


### 적용

In [220]:
movies_df['title'] = movies_df['title'].str.strip()
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,True,True
1,2,Jumanji (1995),Adventure Children Fantasy,True,True
2,3,Grumpier Old Men (1995),Comedy Romance,True,True
3,4,Waiting to Exhale (1995),Comedy Drama Romance,True,False
4,5,Father of the Bride Part II (1995),Comedy,True,True
...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,True,False
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,True,False
9700,193585,Flint (2017),Drama,True,False
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,True,False


In [221]:
movies_df[['title_only', 'year']] = movies_df['title'].str.rsplit(' ', n=1, expand=True)
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag,title_only,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,True,True,Toy Story,(1995)
1,2,Jumanji (1995),Adventure Children Fantasy,True,True,Jumanji,(1995)
2,3,Grumpier Old Men (1995),Comedy Romance,True,True,Grumpier Old Men,(1995)
3,4,Waiting to Exhale (1995),Comedy Drama Romance,True,False,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II (1995),Comedy,True,True,Father of the Bride Part II,(1995)
...,...,...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,True,False,Black Butler: Book of the Atlantic,(2017)
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,True,False,No Game No Life: Zero,(2017)
9700,193585,Flint (2017),Drama,True,False,Flint,(2017)
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,True,False,Bungo Stray Dogs: Dead Apple,(2018)


In [222]:
# 연도에 괄호 제거하고 int 타입 변환
movies_df['year'] = movies_df['year'].apply(lambda x: x[1:-1])
movies_df['year'] = movies_df['year'].astype(int)
movies_df

Unnamed: 0,movie_id,title,genres,has_genre,has_tag,title_only,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,True,True,Toy Story,1995
1,2,Jumanji (1995),Adventure Children Fantasy,True,True,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy Romance,True,True,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy Drama Romance,True,False,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,True,True,Father of the Bride Part II,1995
...,...,...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,True,False,Black Butler: Book of the Atlantic,2017
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,True,False,No Game No Life: Zero,2017
9700,193585,Flint (2017),Drama,True,False,Flint,2017
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,True,False,Bungo Stray Dogs: Dead Apple,2018


In [223]:
# 영화명 같이만 개봉 연도 다른 리메이크작 존재함
# title_only 보다 title 그대로 사용
movies_df = movies_df[['movie_id', 'title', 'year', 'genres']]
movies_df

Unnamed: 0,movie_id,title,year,genres
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),1995,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),1995,Comedy Romance
3,4,Waiting to Exhale (1995),1995,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),2017,Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),2017,Animation Comedy Fantasy
9700,193585,Flint (2017),2017,Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),2018,Action Animation


# 데이터 저장

In [224]:
movie_file_path = "data/movies_refined.csv"
movie_file_path

'data/movies_refined.csv'

In [225]:
movies_df.to_csv(movie_file_path, index=False)
movies_df

Unnamed: 0,movie_id,title,year,genres
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),1995,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),1995,Comedy Romance
3,4,Waiting to Exhale (1995),1995,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),2017,Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),2017,Animation Comedy Fantasy
9700,193585,Flint (2017),2017,Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),2018,Action Animation


In [226]:
ratings_file_path = "data/ratings_refined.csv"
ratings_file_path

'data/ratings_refined.csv'

In [227]:
ratings_df.to_csv(ratings_file_path, index=False)
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


# 참고

Timstamp -> Datetime

In [228]:
from datetime import datetime
datetime.fromtimestamp(964982703)

datetime.datetime(2000, 7, 31, 3, 45, 3)

In [229]:
time_unit = 2
datetime.fromtimestamp(964982703 / time_unit)

datetime.datetime(1985, 4, 16, 18, 22, 31, 500000)

In [230]:
from datetime import datetime
tmp = ratings_df.copy()
time_unit = 2
tmp['datetime'] = tmp['timestamp'].apply(lambda x: datetime.fromtimestamp(x/time_unit))
tmp

Unnamed: 0,user_id,movie_id,rating,timestamp,datetime
0,429,420,2.0,828124615,1983-02-14 18:18:27.500
1,429,222,4.0,828124615,1983-02-14 18:18:27.500
2,429,343,3.0,828124615,1983-02-14 18:18:27.500
3,429,349,3.0,828124615,1983-02-14 18:18:27.500
4,429,351,4.0,828124615,1983-02-14 18:18:27.500
...,...,...,...,...,...
100780,514,187031,2.5,1537674927,1994-05-13 22:57:43.500
100781,514,187595,3.0,1537674946,1994-05-13 22:57:53.000
100782,514,5247,2.5,1537757040,1994-05-14 10:22:00.000
100783,514,5246,1.5,1537757059,1994-05-14 10:22:09.500
