In [1]:
import pandas as pd
import itertools
import numpy as np

In [2]:
def movie_preprocessing(movie):
    movie_col = list(movie.columns)
    movie_tag = [doc.split('|') for doc in movie['tag']]
    tag_table = {token: idx for idx, token in enumerate(set(itertools.chain.from_iterable(movie_tag)))}
    movie_tag = pd.DataFrame(movie_tag)
    tag_table = pd.DataFrame(tag_table.items())
    tag_table.columns = ['Tag', 'Index']

    # use one-hot encoding for movie genres (here called tag)
    tag_dummy = np.zeros([len(movie), len(tag_table)])

    for i in range(len(movie)):
        for j in range(len(tag_table)):
            if tag_table['Tag'][j] in list(movie_tag.iloc[i, :]):
                tag_dummy[i, j] = 1

    # combine the tag_dummy one-hot encoding table to original movie files
    movie = pd.concat([movie, pd.DataFrame(tag_dummy)], 1)
    movie_col.extend(['tag' + str(i) for i in range(len(tag_table))])
    movie.columns = movie_col
    movie = movie.drop('tag', 1)
    return movie

In [4]:
movie = pd.read_table('../../data/ml-1m/movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
rating = pd.read_table("../../data/ml-1m/ratings.dat", sep="::", names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
users = pd.read_table("../../data/ml-1m/users.dat", sep="::", names=["user_id", "gender", "age", "occupation", "zip"], engine="python")
movie = movie_preprocessing(movie)
data = pd.merge(rating, movie, on="movie_id")
data = pd.merge(data, users, on="user_id")

In [5]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_name,tag0,tag1,tag2,tag3,tag4,...,tag12,tag13,tag14,tag15,tag16,tag17,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,4211,3791,2,965319075,Footloose (1984),0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M,45,5,77662
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M,45,5,77662
1000206,4211,3840,4,965319197,Pumpkinhead (1988),0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M,45,5,77662
1000207,4211,3766,2,965319138,Missing in Action (1984),0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,M,45,5,77662


In [6]:
data.to_csv("data.csv", index=False)