In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the three raw tables (movies, user tags, user ratings) from the
# current working directory.
movies, tags, ratings = (
    pd.read_csv(csv_name) for csv_name in ("movies.csv", "tags.csv", "ratings.csv")
)

In [3]:
# Turn the pipe-separated genre list into a space-separated string.
# regex=False forces a literal match: as a regular expression '|' is an
# alternation of two empty patterns, and the default value of `regex`
# differs between pandas versions.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

In [4]:
# Keep only ratings made by users who rated at least MIN_RATINGS_PER_USER
# movies; otherwise pivoting the ratings dataframe later for collaborative
# filtering becomes infeasible.
MIN_RATINGS_PER_USER = 200

# transform('size') broadcasts each user's rating count onto that user's
# rows, so the mask is built without calling a Python lambda per user.
ratings_per_user = ratings.groupby('userId')['userId'].transform('size')
ratings_f = ratings[ratings_per_user >= MIN_RATINGS_PER_USER]

# IDs (not titles) of the movies that still have ratings after the filter
movie_list_rating = ratings_f['movieId'].unique().tolist()

In [5]:
# Restrict the movies table to the movie IDs present in the filtered ratings
movies = movies.loc[movies['movieId'].isin(movie_list_rating)]

# Merge the movies and tags dataframes and create a metadata tag string for each movie

In [6]:
# Attach every user tag to its movie (title and genres included).
# The left join keeps movies that have no tags at all.
mixed = movies.merge(tags, how='left', on='movieId')
mixed.head(n=5)

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1644.0,Watched
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1741.0,computer animation
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1741.0,Disney animated feature
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1741.0,Pixar animation
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1741.0,Téa Leoni does not star in this movie


In [7]:
# Collapse the tags into one space-separated string per movie.
# Only the 'tag' column is filled, so untagged movies (NaN after the left
# merge) yield an empty string while the numeric userId column is left alone.
# The result is a one-column frame ('tag') indexed by movieId.
mixed = (
    mixed.assign(tag=mixed['tag'].fillna(""))
    .groupby('movieId')['tag']
    .agg(' '.join)
    .to_frame()
)


In [8]:
# 'metadata' is the text field that combines a movie's genres with all of
# its tags.
final = movies.merge(mixed, on='movieId', how='left')
# Vectorised concatenation instead of a row-wise apply; strip() removes the
# dangling separator left behind when a movie has an empty tag string.
final['metadata'] = final['genres'].str.cat(final['tag'], sep=' ').str.strip()

In [9]:
# Preview the genres, tag and combined metadata columns
final.head(n=5)

Unnamed: 0,movieId,title,genres,tag,metadata
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Watched computer animation Disney animated fea...,Adventure Animation Children Comedy Fantasy Wa...
1,2,Jumanji (1995),Adventure Children Fantasy,time travel adapted from:book board game child...,Adventure Children Fantasy time travel adapted...
2,3,Grumpier Old Men (1995),Comedy Romance,old people that is actually funny sequel fever...,Comedy Romance old people that is actually fun...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,chick flick revenge characters chick flick cha...,Comedy Drama Romance chick flick revenge chara...
4,5,Father of the Bride Part II (1995),Comedy,Diane Keaton family sequel Steve Martin weddin...,Comedy Diane Keaton family sequel Steve Martin...


In [10]:
# 'genres' and 'tag' are now contained in 'metadata', so drop them.
# errors='ignore' lets this cell be re-run without a KeyError once the
# columns are already gone.
final = final.drop(columns=['genres', 'tag'], errors='ignore')

In [11]:
# Inspect the first 100 rows of the trimmed table
final.head(n=100)

Unnamed: 0,movieId,title,metadata
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy Wa...
1,2,Jumanji (1995),Adventure Children Fantasy time travel adapted...
2,3,Grumpier Old Men (1995),Comedy Romance old people that is actually fun...
3,4,Waiting to Exhale (1995),Comedy Drama Romance chick flick revenge chara...
4,5,Father of the Bride Part II (1995),Comedy Diane Keaton family sequel Steve Martin...
...,...,...,...
95,97,"Hate (Haine, La) (1995)",Crime Drama class conflict angry black and whi...
96,98,Shopping (1994),Action Thriller want to own directorial debut ...
97,99,Heidi Fleiss: Hollywood Madam (1995),Documentary
98,100,City Hall (1996),Drama Thriller corruption Al Pacino Harold Bec...


In [12]:
# Write the per-movie metadata table to disk without the row index
final.to_csv(path_or_buf="final.csv", index=False)

In [13]:
# Join the filtered ratings onto the movie IDs kept in `movies`
ratings_f1 = movies[['movieId']].merge(ratings_f, how='left', on='movieId')


In [16]:
# Preview the merged ratings table
ratings_f1.head(n=5)

Unnamed: 0,movieId,userId,rating
0,1,11,4.5
1,1,14,4.5
2,1,24,4.0
3,1,31,3.0
4,1,53,4.0


In [14]:
# Write the merged ratings table to disk without the row index
ratings_f1.to_csv(path_or_buf="ratings_f1.csv", index=False)

In [15]:
# Preview the filtered movies table
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
