In [118]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os, time, random
from pathlib import Path

In [119]:
cur_dir = Path('.').absolute()
data_dir=cur_dir.parent/ 'data'

In [120]:
list(data_dir.iterdir())

[PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/tag.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/movie.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/link.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/rating.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/movies.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/genome_tags.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system(collaborati/data/genome_scores.csv')]

## Exploring Movie df

In [185]:
movie_df = pd.read_csv(data_dir/'movie.csv')

In [186]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [187]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [188]:
movie_df.isna().sum() # no nulls

movieId    0
title      0
genres     0
dtype: int64

In [189]:
movie_df.duplicated().sum() #no duplicates

0

In [190]:
movie_df.title.nunique()

27262

In [191]:

# Calculate the value counts for each movie title
title_value_counts = movie_df['title'].value_counts()

# Filter titles that appear more than once
duplicate_titles = title_value_counts[title_value_counts > 1].index.tolist()

print(duplicate_titles)

['Aladdin (1992)', 'Johnny Express (2014)', 'Chaos (2005)', 'Hamlet (2000)', '20,000 Leagues Under the Sea (1997)', 'Darling (2007)', 'Casanova (2005)', 'Paradise (2013)', 'Beneath (2013)', 'Girl, The (2012)', 'Clear History (2013)', 'Emma (1996)', 'Offside (2006)', 'Blackout (2007)', 'Men with Guns (1997)', 'War of the Worlds (2005)']


Some movies have multiple entries with different `movieid` , but it doesn't affect much 

In [192]:
movie_df.genres.head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

so in genres column there seems to have no spaces bw genres, lets see want unique genres as there

In [193]:
all_genres = movie_df.genres.apply(lambda x : ' '.join(str(x).split('|'))).values.tolist() # split from |
all_genres = ' '.join(set(all_genres)).split() # join all strings and break them into words
all_genres = set(all_genres)  # make a set to find unique ones

In [194]:
print(all_genres, len(all_genres))

{'Animation', 'Children', 'Western', 'Mystery', 'Romance', 'Crime', 'Adventure', '(no', 'Documentary', 'Musical', 'War', 'IMAX', 'Sci-Fi', 'Horror', 'Comedy', 'Thriller', 'Film-Noir', 'Drama', 'listed)', 'Fantasy', 'genres', 'Action'} 22


there are 20 genres and 1 for movies with no genre (which is (no listed)) which is broken as '(no' and 'listed)'

In [195]:
movie_df['tags'] = movie_df['title'] + ' ' + movie_df['genres'].apply(lambda x: ' '.join(str(x).split('|')))

In [196]:
# movie_df['Title'] = movie_df.title.apply(lambda x : str(x).split('(')[0])
# movie_df['Year'] = movie_df.title.apply(lambda x : (str(x).split('(')[-1]).strip(')'))

In [197]:
movie_df.sample(22)

Unnamed: 0,movieId,title,genres,tags
14524,72698,Out of Reach (2004),Action|Drama|Romance|Thriller,Out of Reach (2004) Action Drama Romance Thriller
19845,98022,Sleepless Night (Nuit blanche) (2011),Action|Crime|Thriller,Sleepless Night (Nuit blanche) (2011) Action C...
21511,104356,Museum Hours (2013),Drama,Museum Hours (2013) Drama
3642,3733,"Paper Chase, The (1973)",Drama,"Paper Chase, The (1973) Drama"
9638,31090,Fuhrer Ex (Führer EX) (2002),Drama,Fuhrer Ex (Führer EX) (2002) Drama
5693,5792,Roger Dodger (2002),Comedy|Drama,Roger Dodger (2002) Comedy Drama
14788,73983,"Maid, The (Nana, La) (2009)",Drama,"Maid, The (Nana, La) (2009) Drama"
19201,95504,Virtual JFK: Vietnam If Kennedy Had Lived (2008),Documentary,Virtual JFK: Vietnam If Kennedy Had Lived (200...
21484,104249,Mr. Moto in Danger Island (1939),Crime|Drama|Mystery|Thriller,Mr. Moto in Danger Island (1939) Crime Drama M...
26290,126186,The Sex and Violence Family Hour (1983),(no genres listed),The Sex and Violence Family Hour (1983) (no ge...


## Exploraing User data

In [203]:
user_df = pd.read_csv(data_dir/'rating.csv', usecols=['userId','movieId','rating'])

In [213]:
# this columns are using too much precision for very low values, lowering the datatype precision
user_df['movieId'] = user_df['movieId'].astype('int16')
user_df['userId'] = user_df['userId'].astype('int16')
user_df['rating'] = user_df['rating'].astype('float16')
print()




In [208]:
user_df.shape #(20000263,4)

(20000263, 3)

In [214]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int16  
 1   movieId  int16  
 2   rating   float16
dtypes: float16(1), int16(2)
memory usage: 114.4 MB


In [216]:
user_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
