# 데이터 로드

https://grouplens.org/datasets/movielens/

https://files.grouplens.org/datasets/movielens/ml-10m-README.html

In [1]:
MOVIE_DATA_PATH = "data/movies.csv"
RATING_DATA_PATH = "data/ratings.csv"
TAG_DATA_PATH = "data/tags.csv"

In [2]:
import pandas as pd

movies_df = pd.read_csv(MOVIE_DATA_PATH)
ratings_df = pd.read_csv(RATING_DATA_PATH)
tags_df = pd.read_csv(TAG_DATA_PATH)

In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


# 전처리

## 컬럼명 변경

In [6]:
# movies_df = movies_df.rename(columns={'movieId': 'movie_id'}, inplace=True)
movies_df = movies_df.rename(columns={'movieId': 'movie_id'})
ratings_df = ratings_df.rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})
tags_df = tags_df.rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})

In [7]:
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [8]:
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
tags_df

Unnamed: 0,user_id,movie_id,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


## ID 형변환

In [10]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   int64 
 1   title     9742 non-null   object
 2   genres    9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [11]:
movies_df['movie_id'] = movies_df['movie_id'].astype(str)
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   object
 1   title     9742 non-null   object
 2   genres    9742 non-null   object
dtypes: object(3)
memory usage: 228.5+ KB


In [12]:
ratings_df['user_id'] = ratings_df['user_id'].astype(str)
ratings_df['movie_id'] = ratings_df['movie_id'].astype(str)
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100836 non-null  object 
 1   movie_id   100836 non-null  object 
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


In [13]:
tags_df['user_id'] = tags_df['user_id'].astype(str)
tags_df['movie_id'] = tags_df['movie_id'].astype(str)
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    3683 non-null   object
 1   movie_id   3683 non-null   object
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 115.2+ KB


## 중복 처리

### 중복체크

In [14]:
len(movies_df)  # 무비 데이터 수

9742

In [15]:
movies_df['movie_id'].nunique()  # 무비 id 수

9742

In [16]:
movies_df['title'].nunique()  # 무비 제목 != 무비 id 수
# 미스매치 일어나는 것 해결하고 가야 함

9737

In [17]:
# 제목 중복된 영화
movies_df[movies_df.duplicated('title')]

Unnamed: 0,movie_id,title,genres
5601,26958,Emma (1996),Romance
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [18]:
# 중복 개수까지 체크
movies_df['title'].value_counts()[movies_df['title'].value_counts() > 1]

title
Emma (1996)                               2
War of the Worlds (2005)                  2
Confessions of a Dangerous Mind (2002)    2
Eros (2004)                               2
Saturn 3 (1980)                           2
Name: count, dtype: int64

### 중복값 처리

영화1) Emma

movies 데이터 처리

In [19]:
movie_titile = "Emma (1996)"
movies_df[movies_df['title'] == movie_titile]
#같은 emma인데 장르가 다르게 -> id 달라짐 -> 장르를 merge 해주면 됨

Unnamed: 0,movie_id,title,genres
650,838,Emma (1996),Comedy|Drama|Romance
5601,26958,Emma (1996),Romance


In [20]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
unique_id

'838'

In [21]:
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]
duplicated_id

'26958'

In [22]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


ratings 데이터 처리

In [23]:
ratings_df[ratings_df['movie_id'] == duplicated_id]
#우리가 작업한 거에서는 26958은 지워진 아이디. 통합한 유니크 아이디로 레퍼런스 찾아갈 수 있게 해줘야 함.

Unnamed: 0,user_id,movie_id,rating,timestamp
80596,509,26958,3.5,1436031753


In [24]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([80596], dtype='int64')

In [25]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [26]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [27]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
80596,509,838,3.5,1436031753


영화2) War of the Worlds

movies 데이터 처리

In [28]:
movie_titile = "War of the Worlds (2005)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi


In [29]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
unique_id

'34048'

In [30]:
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]
duplicated_id

'64997'

In [31]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


ratings 데이터 처리

In [32]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
4747,28,64997,3.5,1234850075
11451,68,64997,2.5,1230497715


In [33]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([4747, 11451], dtype='int64')

In [34]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [35]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [36]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
4747,28,34048,3.5,1234850075
11451,68,34048,2.5,1230497715


영화3) Confessions of a Dangerous Mind

movies 데이터 처리

In [37]:
movie_titile = "Confessions of a Dangerous Mind (2002)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller


In [38]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [39]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


ratings 데이터 처리

In [40]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
17819,111,144606,4.0,1517441257


In [41]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([17819], dtype='int64')

In [42]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [43]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [44]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
17819,111,6003,4.0,1517441257


영화4) Eros

movies 데이터 처리

In [45]:
movie_titile = "Eros (2004)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres
5854,32600,Eros (2004),Drama
9135,147002,Eros (2004),Drama|Romance


In [46]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [47]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


ratings 데이터 처리

In [48]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
49832,318,147002,4.0,1502207152


In [49]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([49832], dtype='int64')

In [50]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [51]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [52]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
49832,318,32600,4.0,1502207152


영화5) Saturn 3

movie 데이터 처리

In [53]:
movie_titile = "Saturn 3 (1980)"
movies_df[movies_df['title'] == movie_titile]

Unnamed: 0,movie_id,title,genres
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [54]:
unique_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[0]
duplicated_id = movies_df[movies_df['title'] == movie_titile]['movie_id'].values[1]

In [55]:
# 중복 데이터 제거
movies_df = movies_df[~(movies_df['movie_id'] == duplicated_id)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


ratings 데이터 처리

In [56]:
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp
81458,514,168358,2.5,1533945970


In [57]:
duplicated_index = ratings_df[ratings_df['movie_id'] == duplicated_id].index
duplicated_index

Index([81458], dtype='int64')

In [58]:
ratings_df.loc[duplicated_index, 'movie_id'] = unique_id

In [59]:
# 검산: 삭제 여부 확인
ratings_df[ratings_df['movie_id'] == duplicated_id]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [60]:
# 검산: 중복 id 변경 확인
ratings_df.loc[duplicated_index]

Unnamed: 0,user_id,movie_id,rating,timestamp
81458,514,2851,2.5,1533945970


### 중복값 처리 확인

In [61]:
len(movies_df)

9737

In [62]:
movies_df['movie_id'].nunique()

9737

In [63]:
movies_df['title'].nunique()

9737

## 장르 결측치 처리

Imputation

In [64]:
no_genre_cond = movies_df['genres'].str.contains(' ')
movies_df[no_genre_cond]

Unnamed: 0,movie_id,title,genres
8517,114335,La cravate (1957),(no genres listed)
8684,122888,Ben-hur (2016),(no genres listed)
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
8782,129250,Superfast! (2015),(no genres listed)
8836,132084,Let It Be Me (1995),(no genres listed)
8902,134861,Trevor Noah: African American (2013),(no genres listed)
9033,141131,Guardians (2016),(no genres listed)
9053,141866,Green Room (2015),(no genres listed)
9070,142456,The Brand New Testament (2015),(no genres listed)
9091,143410,Hyena Road,(no genres listed)


In [65]:
len(movies_df[no_genre_cond])

34

In [66]:
len(movies_df[~no_genre_cond])

9703

In [67]:
# 장르 없으면 사용안할 것
del_movie_ids = movies_df.loc[no_genre_cond, 'movie_id'].tolist()
len(del_movie_ids)

34

In [68]:
movies_df = movies_df[~movies_df['movie_id'].isin(del_movie_ids)].copy()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [69]:
# 검산: 삭제 id 데이터는 없어야 한
movies_df[no_genre_cond]

  movies_df[no_genre_cond]


Unnamed: 0,movie_id,title,genres


ratings 데이터도 처리 필요

In [70]:
len(ratings_df)

100836

In [71]:
# 삭제할 영화 관련 데이터
len(ratings_df[ratings_df['movie_id'].isin(del_movie_ids)])

47

In [72]:
100836 - 47  # 삭제 후 데이터 개수

100789

In [73]:
ratings_df = ratings_df[~ratings_df['movie_id'].isin(del_movie_ids)].copy()
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


## 한 유저가 한 영화에 대해 평점을 여러번 남겼을 때의 처리

In [74]:
from IPython.display import display

## 한 유저가 한 영화에 대해 여러번 평점을 남겼을 때의 처리
# - 최근 평점으로 처리

# 0) 중복 제거 전 상태를 따로 보관해두고 싶다면 (선택)
ratings_before = ratings_df.copy()

# 1) (user_id, movie_id) 별로 평점이 몇 번 남았는지 집계
pair_counts = (
    ratings_df
    .groupby(["user_id", "movie_id"])
    .size()
    .reset_index(name="n_ratings")
)

# 같은 영화에 2번 이상 평점을 남긴 경우만 필터링
multi_rated_pairs = pair_counts[pair_counts["n_ratings"] > 1]

print(f"한 영화에 여러 번 평점을 남긴 (user_id, movie_id) 쌍 개수: {len(multi_rated_pairs)}")
print("\n=== 여러 번 평점을 남긴 쌍 Top 10 ===")
display(
    multi_rated_pairs
    .sort_values("n_ratings", ascending=False)
    .head(10)
)

한 영화에 여러 번 평점을 남긴 (user_id, movie_id) 쌍 개수: 4

=== 여러 번 평점을 남긴 쌍 Top 10 ===


Unnamed: 0,user_id,movie_id,n_ratings
2847,111,6003,2
29460,28,34048,2
72313,509,838,2
96090,68,34048,2


In [75]:
# 2) 샘플 몇 개 골라서 "중복 제거 전" 실제 row 보기
if len(multi_rated_pairs) > 0:
    sample_pairs = multi_rated_pairs.sample(
        min(5, len(multi_rated_pairs)),
        random_state=42,
    )

    print("\n=== 중복 제거 전: 같은 영화에 여러 번 평점을 남긴 예시 ===")
    before_sample = (
        ratings_df
        .merge(sample_pairs[["user_id", "movie_id"]], on=["user_id", "movie_id"], how="inner")
        .sort_values(["user_id", "movie_id", "timestamp"])
    )

    display(
        before_sample[["user_id", "movie_id", "rating", "timestamp"]]
    )
else:
    print("\n여러 번 평점을 남긴 (user_id, movie_id) 쌍이 없습니다.")


=== 중복 제거 전: 같은 영화에 여러 번 평점을 남긴 예시 ===


Unnamed: 0,user_id,movie_id,rating,timestamp
4,111,6003,4.0,1516468531
5,111,6003,4.0,1517441257
0,28,34048,3.5,1234516420
1,28,34048,3.5,1234850075
6,509,838,3.5,1436031723
7,509,838,3.5,1436031753
2,68,34048,2.0,1158532246
3,68,34048,2.5,1230497715


In [76]:
# 3) 실제 중복 제거: 가장 최신 timestamp의 평점만 남기기
ratings_df = (
    ratings_df
    .sort_values("timestamp")  # 오래된 → 최신 순으로 정렬
    .drop_duplicates(subset=["user_id", "movie_id"], keep="last")
    .reset_index(drop=True)
)

print("\n=== 중복 제거 전/후 row 수 비교 ===")
print(f"- 제거 전: {len(ratings_before)} rows")
print(f"- 제거 후: {len(ratings_df)} rows")


=== 중복 제거 전/후 row 수 비교 ===
- 제거 전: 100789 rows
- 제거 후: 100785 rows


In [77]:
# 4) 같은 쌍에 대해 "중복 제거 후" 어떤 평점이 남았는지 보기
if len(multi_rated_pairs) > 0:
    print("\n=== 중복 제거 후: 같은 (user_id, movie_id) 쌍에 대해 최종 남은 평점 ===")
    after_sample = (
        ratings_df
        .merge(sample_pairs[["user_id", "movie_id"]], on=["user_id", "movie_id"], how="inner")
        .sort_values(["user_id", "movie_id"])
    )

    display(
        after_sample[["user_id", "movie_id", "rating", "timestamp"]]
    )


=== 중복 제거 후: 같은 (user_id, movie_id) 쌍에 대해 최종 남은 평점 ===


Unnamed: 0,user_id,movie_id,rating,timestamp
3,111,6003,4.0,1517441257
1,28,34048,3.5,1234850075
2,509,838,3.5,1436031753
0,68,34048,2.5,1230497715


## 그외 결측치 체크

In [78]:
movies_df.columns[movies_df.isna().any()].tolist()

[]

## 인덱스 정리

In [79]:
# 중복 제거 작업으로 인덱스 불일치 상태
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [80]:
movies_df = movies_df.reset_index(drop=True)
movies_df
# 중요!! 삭제 과정 후에는 항상 reset_index를 해야 한다.(pandas에서 했다면 반드시)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9699,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9700,193585,Flint (2017),Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [81]:
# 중복 제거 작업으로 인덱스 불일치 상태
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


In [82]:
ratings_df = ratings_df.reset_index(drop=True)
ratings_df
# 중요!! 삭제 과정 후에는 항상 reset_index를 해야 한다.(pandas에서 했다면 반드시)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


## 구분자 변경

TfidfVectorizer 사용하려면 space 를 seperator 로 써야

In [83]:
# 장르 이름에는 빈칸 없다
# 만약 빈칸 있으면 _ 붙이는 처리 필요
# 예) k drama|fun --> k drama fun  (X)
#                 --> k_drama fun  (O)
movies_df[movies_df['genres'].str.contains(' ')]

Unnamed: 0,movie_id,title,genres


In [84]:
movies_df['genres']

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9698                Action|Animation|Comedy|Fantasy
9699                       Animation|Comedy|Fantasy
9700                                          Drama
9701                               Action|Animation
9702                                         Comedy
Name: genres, Length: 9703, dtype: object

In [85]:
movies_df['genres'].str.replace('\|',' ', regex=True)

0       Adventure Animation Children Comedy Fantasy
1                        Adventure Children Fantasy
2                                    Comedy Romance
3                              Comedy Drama Romance
4                                            Comedy
                           ...                     
9698                Action Animation Comedy Fantasy
9699                       Animation Comedy Fantasy
9700                                          Drama
9701                               Action Animation
9702                                         Comedy
Name: genres, Length: 9703, dtype: object

In [86]:
movies_df['genres'] = movies_df['genres'].str.replace(r'\|',' ', regex=True)
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9700,193585,Flint (2017),Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation


## 연도 결측치 처리

In [87]:
# 제목에 연도 없는 데이터 검색
# movies_df[~movies_df['title'].str.contains('(')]  <-- \( 사용해야함
movies_df[~movies_df['title'].str.contains(r'\(')]

Unnamed: 0,movie_id,title,genres
6058,40697,Babylon 5,Sci-Fi
9023,140956,Ready Player One,Action Sci-Fi Thriller
9163,149334,Nocturnal Animals,Drama Thriller
9345,162414,Moonlight,Drama


웹 검색으로 영화 연도 찾은 결과
```
'Babylon 5': 1994
'Ready Player One': 2018
'Nocturnal Animals': 2017
'Moonlight': 2009
```

In [88]:
tmp = movies_df.copy()

In [89]:
tmp.loc[6058, 'title'] = tmp.loc[6058, 'title'] + " (1994)"
tmp.loc[[6058]]

Unnamed: 0,movie_id,title,genres
6058,40697,Babylon 5 (1994),Sci-Fi


In [90]:
# 중복 실행 시 연도 중첩 방지 필요
tmp.loc[6058, 'title'] = tmp.loc[6058, 'title'] + " (1994)"
tmp.loc[[6058]]

Unnamed: 0,movie_id,title,genres
6058,40697,Babylon 5 (1994) (1994),Sci-Fi


In [91]:
# 연도 없는지 체크
'(' not in tmp.loc[6058, 'title']

False

In [92]:
tmp = movies_df.copy()
idx = 6058
year = 1994

if '(' not in tmp.loc[idx, 'title']:
    tmp.loc[idx, 'title'] = tmp.loc[idx, 'title'] + f" ({year})"

tmp.loc[[idx]]

Unnamed: 0,movie_id,title,genres
6058,40697,Babylon 5 (1994),Sci-Fi


In [93]:
# 직접 입력
idx = 6058
year = 1994
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres
6058,40697,Babylon 5 (1994),Sci-Fi


In [94]:
# 직접 입력
idx = 9023
year = 2018
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres
9023,140956,Ready Player One (2018),Action Sci-Fi Thriller


In [95]:
# 직접 입력
idx = 9163
year = 2017
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres
9163,149334,Nocturnal Animals (2017),Drama Thriller


In [96]:
# 직접 입력
idx = 9345
year = 2009
if '(' not in movies_df.loc[idx, 'title']:
    movies_df.loc[idx, 'title'] = movies_df.loc[idx, 'title'] + f" ({year})"
movies_df.loc[[idx]]

Unnamed: 0,movie_id,title,genres
9345,162414,Moonlight (2009),Drama


In [97]:
# 검산: 제목에 연도 없는 데이터 검색
movies_df[~movies_df['title'].str.contains(r'\(')]

Unnamed: 0,movie_id,title,genres


## 영화명/연도 분리

### 연습

In [98]:
"Waiting to Exhale (1995)".rsplit(' ', 1)

['Waiting to Exhale', '(1995)']

In [99]:
# 여백 주의! split에 영향줌
"Waiting to Exhale (1995) ".rsplit(' ', 1)

['Waiting to Exhale (1995)', '']

In [100]:
"Waiting to Exhale (1995) ".strip()

'Waiting to Exhale (1995)'

In [101]:
"Waiting to Exhale (1995) ".strip().rsplit(' ', 1)

['Waiting to Exhale', '(1995)']

In [102]:
movies_df['title'].str.rsplit(' ', n=1)

0                                [Toy Story, (1995)]
1                                  [Jumanji, (1995)]
2                         [Grumpier Old Men, (1995)]
3                        [Waiting to Exhale, (1995)]
4              [Father of the Bride Part II, (1995)]
                            ...                     
9698    [Black Butler: Book of the Atlantic, (2017)]
9699                 [No Game No Life: Zero, (2017)]
9700                                 [Flint, (2017)]
9701          [Bungo Stray Dogs: Dead Apple, (2018)]
9702          [Andrew Dice Clay: Dice Rules, (1991)]
Name: title, Length: 9703, dtype: object

In [103]:
movies_df['title'].str.rsplit(' ', n=1, expand=True)

Unnamed: 0,0,1
0,Toy Story,(1995)
1,Jumanji,(1995)
2,Grumpier Old Men,(1995)
3,Waiting to Exhale,(1995)
4,Father of the Bride Part II,(1995)
...,...,...
9698,Black Butler: Book of the Atlantic,(2017)
9699,No Game No Life: Zero,(2017)
9700,Flint,(2017)
9701,Bungo Stray Dogs: Dead Apple,(2018)


### 적용

In [104]:
movies_df['title'] = movies_df['title'].str.strip()
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9700,193585,Flint (2017),Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation


In [105]:
movies_df[['title_only', 'year']] = movies_df['title'].str.rsplit(' ', n=1, expand=True)
movies_df

Unnamed: 0,movie_id,title,genres,title_only,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Toy Story,(1995)
1,2,Jumanji (1995),Adventure Children Fantasy,Jumanji,(1995)
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men,(1995)
3,4,Waiting to Exhale (1995),Comedy Drama Romance,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,(1995)
...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,Black Butler: Book of the Atlantic,(2017)
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,No Game No Life: Zero,(2017)
9700,193585,Flint (2017),Drama,Flint,(2017)
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,Bungo Stray Dogs: Dead Apple,(2018)


In [106]:
# 연도에 괄호 제거하고 int 타입 변환
movies_df['year'] = movies_df['year'].apply(lambda x: x[1:-1])
movies_df['year'] = movies_df['year'].astype(int)
movies_df

Unnamed: 0,movie_id,title,genres,title_only,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure Children Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy Drama Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
...,...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,Black Butler: Book of the Atlantic,2017
9699,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,No Game No Life: Zero,2017
9700,193585,Flint (2017),Drama,Flint,2017
9701,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,Bungo Stray Dogs: Dead Apple,2018


In [107]:
# 영화명 같이만 개봉 연도 다른 리메이크작 존재함
# title_only 보다 title 그대로 사용
movies_df = movies_df[['movie_id', 'title', 'year', 'genres']]
movies_df

Unnamed: 0,movie_id,title,year,genres
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),1995,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),1995,Comedy Romance
3,4,Waiting to Exhale (1995),1995,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),2017,Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),2017,Animation Comedy Fantasy
9700,193585,Flint (2017),2017,Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),2018,Action Animation


# 데이터 저장

In [108]:
movie_file_path = "data/movies_refined.csv"
movie_file_path

'data/movies_refined.csv'

In [109]:
movies_df.to_csv(movie_file_path, index=False)
movies_df

Unnamed: 0,movie_id,title,year,genres
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),1995,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),1995,Comedy Romance
3,4,Waiting to Exhale (1995),1995,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...,...
9698,193581,Black Butler: Book of the Atlantic (2017),2017,Action Animation Comedy Fantasy
9699,193583,No Game No Life: Zero (2017),2017,Animation Comedy Fantasy
9700,193585,Flint (2017),2017,Drama
9701,193587,Bungo Stray Dogs: Dead Apple (2018),2018,Action Animation


In [110]:
ratings_file_path = "data/ratings_refined.csv"
ratings_file_path

'data/ratings_refined.csv'

In [111]:
ratings_df.to_csv(ratings_file_path, index=False)
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615
...,...,...,...,...
100780,514,187031,2.5,1537674927
100781,514,187595,3.0,1537674946
100782,514,5247,2.5,1537757040
100783,514,5246,1.5,1537757059


# 참고

Timstamp -> Datetime

In [112]:
from datetime import datetime
datetime.fromtimestamp(964982703)

datetime.datetime(2000, 7, 31, 3, 45, 3)

In [113]:
time_unit = 2
datetime.fromtimestamp(964982703 / time_unit)

datetime.datetime(1985, 4, 16, 18, 22, 31, 500000)

In [114]:
from datetime import datetime
tmp = ratings_df.copy()
time_unit = 2
tmp['datetime'] = tmp['timestamp'].apply(lambda x: datetime.fromtimestamp(x/time_unit))
tmp

Unnamed: 0,user_id,movie_id,rating,timestamp,datetime
0,429,420,2.0,828124615,1983-02-14 18:18:27.500
1,429,222,4.0,828124615,1983-02-14 18:18:27.500
2,429,343,3.0,828124615,1983-02-14 18:18:27.500
3,429,349,3.0,828124615,1983-02-14 18:18:27.500
4,429,351,4.0,828124615,1983-02-14 18:18:27.500
...,...,...,...,...,...
100780,514,187031,2.5,1537674927,1994-05-13 22:57:43.500
100781,514,187595,3.0,1537674946,1994-05-13 22:57:53.000
100782,514,5247,2.5,1537757040,1994-05-14 10:22:00.000
100783,514,5246,1.5,1537757059,1994-05-14 10:22:09.500
