## MovieLensデータ整形

In [1]:
import pandas as pd
import numpy as np

import os
import pickle as pkl

# Notebook-wide display limits: at most 30 columns and 10 rows per rendered frame.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 10

### user情報
---
```
u.user     -- Demographic information about the users; this is a tab
              separated list of
              user id | age | gender | occupation | zip code
              The user ids are the ones used in the u.data data set.
```

In [12]:
## User demographics: [age, gender, occupation, zip code]
# u.user is pipe-delimited and has no header row, so column names are supplied here.
# zip_code is pinned to str so codes with a leading zero (e.g. "02215") are kept
# verbatim instead of depending on pandas' dtype inference for the column.
df_user = pd.read_csv(
    './origin_data/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    dtype={'zip_code': str},
)
df_user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [3]:
# Reduce each zip code to its first two characters. Missing values are passed
# through unchanged: pandas stores them as NaN, which an `is None` test never
# matches (str(NaN)[:2] would silently turn into the bogus prefix 'na').
df_user['zip_code'] = df_user['zip_code'].map(lambda z: z if pd.isna(z) else str(z)[:2])

# Row counts per 2-character zip prefix.
df_user.groupby('zip_code').count()

Unnamed: 0_level_0,user_id,age,gender,occupation
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00,2,2,2,2
01,20,20,20,20
02,24,24,24,24
03,6,6,6,6
04,2,2,2,2
...,...,...,...,...
V0,2,2,2,2
V1,1,1,1,1
V3,1,1,1,1
V5,1,1,1,1


#### user vecの作成保存

In [4]:
# Build the user profile vectors (min-max scaled age + one-hot gender, occupation
# and 2-character zip prefix) and pickle them to ./user_profile.arr.
# NOTE: this cell rebinds df_user to the encoded frame, so re-running it requires
# re-running the cell that loads u.user first.

# Keep only the first two characters of the zip code; NaN is left as-is
# (an `is None` check never matches pandas' NaN).
df_user['zip_code'] = df_user['zip_code'].map(lambda z: z if pd.isna(z) else str(z)[:2])

# dtype is pinned because pandas >= 2.0 produces bool dummy columns by default,
# which would make `.values` below an object array instead of a numeric one.
df_user = pd.get_dummies(df_user[['age', 'gender', 'occupation', 'zip_code']], dtype=np.uint8)

# age -> min-max normalization (computed on the Series, not a one-column frame)
age_raw = df_user['age']
df_user['age'] = (age_raw - age_raw.min()) / (age_raw.max() - age_raw.min())

user_prof_vecs = df_user.values
with open("./user_profile.arr", 'wb') as f:
    pkl.dump(user_prof_vecs, f)

df_user

Unnamed: 0,age,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,occupation_homemaker,occupation_lawyer,occupation_librarian,occupation_marketing,...,zip_code_E2,zip_code_K7,zip_code_L1,zip_code_L9,zip_code_M4,zip_code_M7,zip_code_N2,zip_code_N4,zip_code_R3,zip_code_T8,zip_code_V0,zip_code_V1,zip_code_V3,zip_code_V5,zip_code_Y1
0,0.257576,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.696970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.242424,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.257576,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.393939,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.287879,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
939,0.378788,0,1,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
940,0.196970,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
941,0.621212,1,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# Show the array taken from df_user.values (the same object that was pickled to ./user_profile.arr).
user_prof_vecs

array([[0.25757576, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.6969697 , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.24242424, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.1969697 , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.62121212, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.22727273, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ]])

### item情報
---
```
u.item     -- Information about the items (movies); this is a tab separated
              list of
              movie id | movie title | release date | video release date |
              IMDb URL | unknown | Action | Adventure | Animation |
              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western |
              The last 19 fields are the genres, a 1 indicates the movie
              is of that genre, a 0 indicates it is not; movies can be in
              several genres at once.
              The movie ids are the ones used in the u.data data set.
```

In [5]:
## movie info [release date, genres]
# u.item is pipe-delimited without a header row: five descriptive fields
# followed by the 19 genre indicator columns.
col_names = (
    ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url']
    + ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)
df_item = pd.read_csv('./origin_data/u.item', names=col_names, sep='|', encoding="latin1")
df_item

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### item vecの作成保存

In [6]:
# Build the item profile vectors (genre flags + one-hot release decade) and
# pickle them to ./item_profile.arr.
# NOTE: this cell rebinds df_item to the encoded frame, so re-running it requires
# re-running the cell that loads u.item first.

# Release decade = year part of "DD-Mon-YYYY" without its last digit,
# e.g. "01-Jan-1995" -> "199". A missing release_date stays NaN, so get_dummies
# emits no column for it and that row gets all-zero decade flags. This replaces
# the previous reliance on str(NaN)[:-1] == 'na' plus dropping 'year_na'.
release_decade = df_item['release_date'].str.split('-').str[2].str[:-1]

df_item = df_item.drop(columns=['movie_id', 'movie_title', 'video_release_date', 'imdb_url', 'release_date', 'unknown'])
df_item['year'] = release_decade

# dtype is pinned because pandas >= 2.0 produces bool dummy columns by default,
# which would make `.values` below an object array when mixed with the int genre flags.
df_item = pd.get_dummies(df_item, dtype=np.uint8)

item_prof_vecs = df_item.values
with open("./item_profile.arr", 'wb') as f:
    pkl.dump(item_prof_vecs, f)

df_item

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year_192,year_193,year_194,year_195,year_196,year_197,year_198,year_199
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1678,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1
1679,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1680,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### rating情報
---
```
u.data     -- The full u data set, 100000 ratings by 943 users on 1682 items.
              Each user has rated at least 20 movies.  Users and items are
              numbered consecutively from 1.  The data is randomly
              ordered. This is a tab separated list of 
	         user id | item id | rating | timestamp. 
              The time stamps are unix seconds since 1/1/1970 UTC
```

In [7]:
# Rating data: u.data is tab-delimited without a header row. A `datetime`
# column is derived from the unix-seconds timestamp and appended last.
df_data = pd.read_csv(
    './origin_data/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'unix_sec'],
).assign(datetime=lambda ratings: pd.to_datetime(ratings['unix_sec'], unit='s'))
df_data

Unnamed: 0,user_id,movie_id,rating,unix_sec,datetime
0,196,242,3,881250949,1997-12-04 15:55:49
1,186,302,3,891717742,1998-04-04 19:22:22
2,22,377,1,878887116,1997-11-07 07:18:36
3,244,51,2,880606923,1997-11-27 05:02:03
4,166,346,1,886397596,1998-02-02 05:33:16
...,...,...,...,...,...
99995,880,476,3,880175444,1997-11-22 05:10:44
99996,716,204,5,879795543,1997-11-17 19:39:03
99997,276,1090,1,874795795,1997-09-20 22:49:55
99998,13,225,2,882399156,1997-12-17 22:52:36


In [19]:
# Check the column dtypes of the rating frame.
df_data.dtypes

user_id              int64
movie_id             int64
rating               int64
unix_sec             int64
datetime    datetime64[ns]
dtype: object

In [20]:
# Count the rating rows for each movie_id.
df_data.groupby('movie_id').count()

Unnamed: 0_level_0,user_id,rating,unix_sec,datetime
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,452,452,452,452
2,131,131,131,131
3,90,90,90,90
4,209,209,209,209
5,86,86,86,86
...,...,...,...,...
1678,1,1,1,1
1679,1,1,1,1
1680,1,1,1,1
1681,1,1,1,1


In [31]:
# Count the rating rows for each distinct timestamp; the group keys come back
# sorted, so the first/last index values are the earliest/latest unix_sec.
df_data.groupby('unix_sec').count()
## max: 893286638, min: 874724710

Unnamed: 0_level_0,user_id,movie_id,rating,datetime
unix_sec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
874724710,1,1,1,1
874724727,1,1,1,1
874724754,1,1,1,1
874724781,1,1,1,1
874724843,1,1,1,1
...,...,...,...,...
893286550,3,3,3,3
893286584,1,1,1,1
893286603,1,1,1,1
893286637,3,3,3,3


### 評価値matrixの作成

In [8]:
# Zero-filled matrix with one row per row of df_user and one column per row of df_item.
np_matrix = np.zeros((df_user.shape[0], df_item.shape[0]))
np_matrix.shape

(943, 1682)

In [9]:
# Write every rating into the user x movie matrix with a single vectorized
# index assignment instead of a Python-level iterrows() loop over all rows.
# Ids in u.data are 1-based, hence the -1 to get array positions.
# NOTE(review): assumes each (user_id, movie_id) pair occurs at most once in
# df_data, so the result does not depend on assignment order — TODO confirm.
user_idx = df_data['user_id'].to_numpy() - 1
movie_idx = df_data['movie_id'].to_numpy() - 1
np_matrix[user_idx, movie_idx] = df_data['rating'].to_numpy()

df_matrix = pd.DataFrame(np_matrix)
df_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,5.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,0.0,4.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### train, test用matrixデータの作成~保存

In [11]:
## create datasets 1-5 from the u{n}.base / u{n}.test split files
# For each split: load the train/test triples (the 4th, timestamp, column is
# skipped via usecols), convert ids to 0-based, build the dense train matrix
# and write everything under ./valcase_{n}/.

for case_num in range(1, 6):
    df_base = pd.read_csv('./origin_data/u{}.base'.format(case_num), sep='\t', usecols=range(3), names=['user_id', 'movie_id', 'rating'])
    df_test = pd.read_csv('./origin_data/u{}.test'.format(case_num), sep='\t', usecols=range(3), names=['user_id', 'movie_id', 'rating'])

    # Ids in the files are numbered from 1; shift them so they double as array indices.
    df_base['user_id'] -= 1
    df_base['movie_id'] -= 1
    df_test['user_id'] -= 1
    df_test['movie_id'] -= 1

    # Dense train matrix, filled with one vectorized assignment instead of iterrows().
    # NOTE(review): assumes each (user_id, movie_id) pair occurs at most once — TODO confirm.
    np_matrix = np.zeros([len(df_user), len(df_item)])
    np_matrix[df_base['user_id'].to_numpy(), df_base['movie_id'].to_numpy()] = df_base['rating'].to_numpy()
    df_matrix = pd.DataFrame(np_matrix)

    ## save data
    dir_name = "./valcase_{}".format(case_num)
    os.makedirs(dir_name, exist_ok=True)  # no error when the directory already exists
    df_matrix.to_csv("{}/matrix.csv".format(dir_name), header=False, index=False)
    df_base.to_csv("{}/train_data.csv".format(dir_name), header=False, index=False)
    df_test.to_csv("{}/test_data.csv".format(dir_name), header=False, index=False)

In [22]:
# Train matrix left over from the split loop, i.e. the one built in its final iteration.
df_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5.0,3.0,0.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,2.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Train triples (0-based ids) left over from the final iteration of the split loop.
df_base

Unnamed: 0,user_id,movie_id,rating
0,0,0,5
1,0,1,3
2,0,3,3
3,0,4,3
4,0,5,5
...,...,...,...
79995,942,942,5
79996,942,1010,2
79997,942,1066,2
79998,942,1073,4


In [24]:
# Test triples (0-based ids) left over from the final iteration of the split loop.
df_test

Unnamed: 0,user_id,movie_id,rating
0,0,2,4
1,0,12,5
2,0,14,5
3,0,17,4
4,0,18,5
...,...,...,...
19995,942,1027,2
19996,942,1043,3
19997,942,1046,2
19998,942,1227,3


### Factorization Machines用

In [33]:
# Train/test files for Factorization Machines: same splits as before, but the
# timestamp column is kept and min-max scaled.

# Smallest and largest unix_sec found in u.data; their difference (18561928)
# is the scaling range.
UNIX_SEC_MIN = 874724710
UNIX_SEC_MAX = 893286638

for case_num in range(1, 6):
    df_base = pd.read_csv('./origin_data/u{}.base'.format(case_num), sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_sec'])
    df_test = pd.read_csv('./origin_data/u{}.test'.format(case_num), sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_sec'])

    # Convert the 1-based ids to 0-based.
    df_base['user_id'] -= 1
    df_base['movie_id'] -= 1
    df_test['user_id'] -= 1
    df_test['movie_id'] -= 1

    # Min-max scale the timestamps with the u.data bounds.
    df_base['unix_sec'] = (df_base['unix_sec'] - UNIX_SEC_MIN) / (UNIX_SEC_MAX - UNIX_SEC_MIN)
    df_test['unix_sec'] = (df_test['unix_sec'] - UNIX_SEC_MIN) / (UNIX_SEC_MAX - UNIX_SEC_MIN)

    dir_name = "./valcase_{}".format(case_num)
    # Create the output directory here so this cell does not depend on an
    # earlier cell having created it.
    os.makedirs(dir_name, exist_ok=True)
    df_base.to_csv("{}/train_data_with_datetime.csv".format(dir_name), header=False, index=False)
    df_test.to_csv("{}/test_data_with_datetime.csv".format(dir_name), header=False, index=False)

In [26]:
## User demographics for the FM input: [age, gender, occupation, zip code]
# Reloaded from u.user; zip_code is pinned to str so leading zeros are kept
# regardless of dtype inference.
df_user = pd.read_csv(
    './origin_data/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    dtype={'zip_code': str},
)
df_user['user_id'] -= 1  # 1-based -> 0-based ids

# Keep only the first two characters of the zip code; NaN is left as-is
# (an `is None` check never matches pandas' NaN).
df_user['zip_code'] = df_user['zip_code'].map(lambda z: z if pd.isna(z) else str(z)[:2])

# age -> min-max normalization (computed on the Series, not a one-column frame)
age_raw = df_user['age']
df_user['age'] = (age_raw - age_raw.min()) / (age_raw.max() - age_raw.min())

df_user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,0,0.257576,M,technician,85
1,1,0.696970,F,other,94
2,2,0.242424,M,writer,32
3,3,0.257576,M,technician,43
4,4,0.393939,F,other,15
...,...,...,...,...,...
938,938,0.287879,F,student,33
939,939,0.378788,M,administrator,02
940,940,0.196970,M,student,97
941,941,0.621212,F,librarian,78


In [27]:
## movie info for the FM input [release decade, genres]
col_names = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 
             'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
df_item = pd.read_csv('./origin_data/u.item', sep='|', names=col_names, encoding = "latin1")
df_item['movie_id'] -= 1  # 1-based -> 0-based ids

## Turn release_date into a decade category
# Year part of "DD-Mon-YYYY" without its last digit, e.g. "01-Jan-1995" -> "199".
# Rows without a release date get the explicit label 'na'. This is the same value
# the previous implementation produced by accidentally slicing str(NaN), so the
# written CSV is unchanged.
df_item['year'] = df_item['release_date'].str.split('-').str[2].str[:-1].fillna('na')

## Drop the columns that are not used as features
df_item = df_item.drop(columns=['movie_title', 'video_release_date', 'imdb_url', 'release_date', 'unknown'])

df_item

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,199
1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,199
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,199
3,3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,199
4,4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1677,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,199
1678,1678,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,199
1679,1679,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,199
1680,1680,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,199


In [28]:
# Write the user and item side-information tables to CSV. index=True is the
# pandas default, so the row index is stored as an unnamed first column.
df_user.to_csv(path_or_buf="./user_info.csv", index=True)
df_item.to_csv(path_or_buf="./item_info.csv", index=True)