In [145]:
import random
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Generate Comments for Ratings

In [None]:
# Load the ratings table; later cells add a 'comments' column to it.
ratings = pd.read_csv('database/ratings.csv')

In [None]:
# Pools of canned comments, one pool per rating band. A comment is drawn at
# random from the pool that matches each rating.
very_good_comments = [
    'very good!',
    "It's my favourite movie in my life!!",
    '<3<3<3',
    'It is the rare movie that succeeds on almost every level',
]
good_comments = [
    'I enjoy it so much.',
    'I like it.',
    ':)',
    ':D',
    "I loved the movie far more than I should have, but that doesn't mean it is perfect.",
]
fair_comments = [
    "I loved the wildly uneven Interstellar far more than I should have, but that doesn't mean it is perfect.",
    "Generally it's okay",
    "I was looking forward to watching this film, but turned out it's just average",
    ':/',
    'fair',
    'not surprising at all',
    'nice view but bad plot',
    'I prefer watching it on the Internet',
]
bad_comments = [
    ':(((',
    '-_-',
    'Boring',
    'I better pay my dollar to buy myself a pineapple pizza',
]
very_bad_comments = [
    ':@@@@@@',
    'very bad!',
    'bad plot, bad actors, everything is bad',
    'I can even make a better plot than this',
    'ridiculous!',
]

In [None]:
# Start every row with an empty comment; the scalar is broadcast to all rows.
ratings['comments'] = ''

In [None]:
def pick_comment(rating):
    """Draw a random canned comment from the pool matching ``rating``.

    Bands (checked from the top down):
      * rating > 4.5          -> very_good_comments
      * 3.5 < rating <= 4.5   -> good_comments
      * 3.0 <= rating <= 3.5  -> fair_comments
      * 2.5 <= rating < 3.0   -> bad_comments
      * anything else         -> very_bad_comments (this includes NaN, because
        every comparison with NaN is False)
    """
    if rating > 4.5:
        return random.choice(very_good_comments)
    if rating > 3.5:
        return random.choice(good_comments)
    if rating >= 3.0:
        return random.choice(fair_comments)
    if rating >= 2.5:
        return random.choice(bad_comments)
    return random.choice(very_bad_comments)


# Map over the column instead of writing cell-by-cell with .loc[i, ...]:
# the result is aligned on the existing index, so it stays correct even when
# the index is not 0..n-1, and it avoids one slow label-based write per row.
ratings['comments'] = ratings['rating'].map(pick_comment)

In [None]:
# Preview the first rows to spot-check the generated comments.
ratings.head()

In [None]:
# Persist the ratings together with the generated comments.
# NOTE(review): index=False is not passed, so the DataFrame index is written
# as an extra unnamed first column - confirm downstream readers expect that.
ratings.to_csv('database/ratings_add_cm.csv')

# Reformatting Movies Dataset

In [81]:
# Load the raw movies table; its 'title' and 'genres' columns are reshaped
# by the cells below.
movies = pd.read_csv('ml-latest-small/movies.csv')

In [82]:
def try_convert_year(x):
    """Return the release year embedded at the end of a movie title.

    The four characters that precede the last character of the right-stripped
    title (the position of ``1995`` in ``"Toy Story (1995)"``) are converted
    with ``int``.

    Parameters
    ----------
    x : str
        Raw movie title.

    Returns
    -------
    int
        The parsed year, or 0 when those characters are not an integer or
        ``x`` has no ``rstrip`` method (e.g. a NaN float).
    """
    try:
        return int(x.rstrip()[-5:-1])
    except (AttributeError, ValueError):
        # Only the expected failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt and SystemExit.
        return 0
    
def try_convert_title(x):
    """Strip a trailing ``(YYYY)`` year suffix from a movie title.

    Parameters
    ----------
    x : str
        Raw movie title.

    Returns
    -------
    str
        The title without its last six characters (the ``(YYYY)`` suffix) and
        without the whitespace that separated the suffix from the name, when
        the four characters before the final character are numeric.
        Otherwise ``x`` is returned unchanged.
    """
    stripped = x.rstrip()
    if stripped[-5:-1].isnumeric():
        # Dropping "(1995)" alone would leave "Toy Story " with a trailing
        # space, so strip once more after removing the suffix.
        return stripped[:-6].rstrip()
    return x
    
# Order matters: the year has to be read from the raw title before the title
# column is overwritten with the suffix-free version.
# NOTE(review): re-running this cell on already-converted titles would yield
# year 0 for every row, because the "(YYYY)" suffix is gone by then.
raw_titles = movies['title']
movies['year'] = raw_titles.apply(try_convert_year)
movies['title'] = raw_titles.apply(try_convert_title)

In [83]:
# Movies whose year could not be parsed carry the sentinel value 0.
year_is_missing = movies['year'] == 0
missing_year_movie_id = movies.loc[year_is_missing, 'movieId'].values

# Hand-entered years, paired positionally with the ids above.
# NOTE(review): zip() stops at the shorter sequence, so a length mismatch
# between the two is silently ignored - confirm there are exactly 12 ids.
missing_year = [
    1993, 2018, 2015, 1983, 2016, 2016,
    2016, 2016, 1980, 2017, 2017, 2011,
]
fill_the_missing = dict(zip(missing_year_movie_id, missing_year))

for movie_id, year in fill_the_missing.items():
    movies.loc[movies['movieId'] == movie_id, 'year'] = year

In [140]:
# Preview the reformatted movies table.
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [218]:
# Write every column except 'genres' to movies_reformat.csv.
movies_without_genres = movies.drop(columns=['genres'])
movies_without_genres.to_csv('movies_reformat.csv')

# Create Genres Dataset

In [88]:
# flatten genres data
# Split each pipe-delimited genre string into a list. Values that are not
# strings (for example lists produced by an earlier run of this cell) are
# left untouched, so the cell can be re-run without raising AttributeError.
movies['genres'] = movies['genres'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

# One list of genres per movie, and the set of distinct genre names.
all_genres = movies['genres'].to_list()
unique_genres = {genre for genres in all_genres for genre in genres}

In [104]:
# Build a long-format (movieId, genre) table with one row per movie/genre pair.
# The real movieId values are paired with the genre lists; the previous
# version iterated over the Series index, so the column named 'movieId'
# actually held row positions (0, 1, 2, ...) rather than movie ids.
# zip() is also used because Series.iteritems() was removed in pandas 2.0.
flatdata = pd.DataFrame(
    [(movie_id, genre)
     for movie_id, genres in zip(movies['movieId'], movies['genres'])
     for genre in genres],
    columns=['movieId', 'genres'],
)
flatdata.to_csv('genres.csv')

# Create Users Dataset

In [106]:
!pip install random-username

Collecting random-username
  Downloading random_username-1.0.2-py3-none-any.whl (6.7 kB)
Installing collected packages: random-username
Successfully installed random-username-1.0.2


In [107]:
from random_username.generate import generate_username

In [105]:
# Rebind `ratings` to the ratings file under ml-latest-small/ (an earlier cell
# bound the same name to database/ratings.csv) and preview it.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [116]:
# One row per distinct user id found in the ratings.
# The ids are computed once and every column is sized from that same array:
# unique() keeps NaN while nunique() ignores it, so mixing the two could
# produce columns of different lengths.
user_ids = ratings.userId.unique()

# NOTE(review): every account gets the same hard-coded plaintext placeholder
# password - acceptable for demo data only; confirm it never reaches a real system.
users = pd.DataFrame({'userId': user_ids,
                      'username': generate_username(len(user_ids)),
                      'password': ['password123'] * len(user_ids)})

In [133]:
# Make usernames unique by prefixing each occurrence of a repeated name with
# its ordinal position within the group of identical names.

nth = {
    0: "first",
    1: "second",
    2: "third",
    3: "fourth",
}


def _ordinal_prefix(position):
    """Return the prefix for the ``position``-th (0-based) duplicate.

    Positions 0-3 use the words in ``nth``. Later positions fall back to
    ``"<n>th"`` so that a name repeated more than four times still gets a
    string prefix instead of becoming NaN.
    """
    return nth.get(position, f"{position + 1}th")


# keep=False flags every member of a duplicated group, including the first.
mask = users.username.duplicated(keep=False)
prefixes = users.loc[mask].groupby('username').cumcount().map(_ordinal_prefix)
users.loc[mask, 'username'] = prefixes + users.loc[mask, 'username']

In [137]:
# Sanity check: tally duplicated vs. non-duplicated usernames.
users.username.duplicated().value_counts()

False    610
Name: username, dtype: int64

In [134]:
# Write the users table (userId, username, password) to users.csv.
# NOTE(review): the DataFrame index is written as an extra unnamed column
# because index=False is not passed - confirm downstream readers expect it.
users.to_csv('users.csv')

# Generate Posters

In [5]:
# Load the movieId -> imdbId / tmdbId mapping and preview it.
links = pd.read_csv('ml-latest-small/links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Inspect column dtypes and non-null counts of the links table.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [16]:
# Build the IMDb title URL for every row; ids are zero-padded to 7 digits.
imdb_links = [
    f'https://www.imdb.com/title/tt{imdb_id:07d}/'
    for imdb_id in links.imdbId
]

In [18]:
# Attach the generated URLs to the links table as a new column.
links['imdb_link'] = imdb_links

In [193]:
# Load the poster URL per imdbId, keeping only the two columns needed for the join.
posters = pd.read_csv('MovieGenre.csv', encoding='latin-1')[['imdbId','Poster']]

In [194]:
# Left-join the poster URLs onto the links table by imdbId, order the result
# by movieId and restore imdbId as an ordinary column.
# NOTE(review): this rebinds `posters`, so re-running the cell joins the
# already-joined frame. The recorded run iterates 9915 rows although links
# has 9742, which suggests repeated imdbId values in the poster file - confirm.
posters = (
    links.set_index('imdbId')
         .join(posters.set_index('imdbId'))
         .sort_values(by='movieId')
         .reset_index()
)

In [195]:
# Display the joined table (pandas truncates the rendering of long frames).
posters

Unnamed: 0,imdbId,movieId,tmdbId,imdb_link,Poster
0,114709,1,862.0,https://www.imdb.com/title/tt0114709/,https://images-na.ssl-images-amazon.com/images...
1,113497,2,8844.0,https://www.imdb.com/title/tt0113497/,https://images-na.ssl-images-amazon.com/images...
2,113228,3,15602.0,https://www.imdb.com/title/tt0113228/,https://images-na.ssl-images-amazon.com/images...
3,114885,4,31357.0,https://www.imdb.com/title/tt0114885/,https://images-na.ssl-images-amazon.com/images...
4,113041,5,11862.0,https://www.imdb.com/title/tt0113041/,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...
9910,5476944,193581,432131.0,https://www.imdb.com/title/tt5476944/,
9911,5914996,193583,445030.0,https://www.imdb.com/title/tt5914996/,
9912,6397426,193585,479308.0,https://www.imdb.com/title/tt6397426/,
9913,8391976,193587,483455.0,https://www.imdb.com/title/tt8391976/,


In [204]:
# Collects the movieId of every poster whose download fails in the next cell.
bad_request = []

In [205]:
# Download every poster to posters/<movieId>.jpg and record the failures.
# `total` gives tqdm a real progress bar instead of a bare iteration counter.
for i, url in tqdm(enumerate(posters.Poster), total=len(posters)):
    movie_id = posters.movieId.loc[i]
    # NOTE(review): ids below 720 are skipped - presumably they were already
    # fetched by an earlier, interrupted run; confirm before a fresh run.
    if movie_id < 720:
        continue
    try:
        urllib.request.urlretrieve(url, f"posters/{movie_id}.jpg")
    except Exception:
        # Any download failure (including a missing URL) is recorded for the
        # scraping fallback. Catching Exception rather than using a bare
        # `except` lets Ctrl-C still interrupt this long-running loop.
        bad_request.append(movie_id)

9915it [33:25,  4.94it/s] 


In [208]:
# Show the movies whose poster download failed.
movies[movies.movieId.isin(bad_request)]

Unnamed: 0,movieId,title,genres,year
586,722,"Haunted World of Edward D. Wood Jr., The",[Documentary],1996
596,741,Ghost in the Shell (Kôkaku kidôtai),"[Animation, Sci-Fi]",1995
625,795,Somebody to Love,[Drama],1994
628,800,Lone Star,"[Drama, Mystery, Western]",1996
631,803,Walking and Talking,"[Comedy, Drama, Romance]",1996
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017
9739,193585,Flint,[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018


In [154]:
from tqdm import tqdm
import urllib.request

In [200]:
# One-off trial of the scraping fallback for a single title.
# The target movieId is looked up from `links` by imdbId instead of relying on
# a loop variable `i` left over from whichever loop ran last in the kernel.
imdb_id = 117108
movie_id = links.loc[links.imdbId == imdb_id, 'movieId'].iloc[0]

# A timeout keeps the request from hanging indefinitely on a stalled connection.
html_page = requests.get(f'https://www.imdb.com/title/tt{imdb_id:07d}/', timeout=30)
soup = BeautifulSoup(html_page.content, 'html.parser')
# NOTE(review): soup.img is the first <img> tag on the page - presumably the
# poster; confirm against the current page layout.
urllib.request.urlretrieve(soup.img['src'], f"posters/{movie_id}.jpg")

('posters/722.jpg', <http.client.HTTPMessage at 0x161e4b9c700>)

In [213]:
# Scrape posters
# Fallback for the movies whose direct download failed: fetch the IMDb title
# page and save the first <img> found there as posters/<movieId>.jpg.
bad_request_2 = []

# Set membership is O(1); testing against the list rescans it for every row.
retry_ids = set(bad_request)

for i, url in tqdm(enumerate(links.imdb_link), total=len(links)):
    movie_id = links.movieId.loc[i]
    if movie_id not in retry_ids:
        continue
    try:
        # The timeout prevents one stalled request from blocking the whole run.
        html_page = requests.get(url, timeout=30)
        soup = BeautifulSoup(html_page.content, 'html.parser')
        # NOTE(review): soup.img is the first <img> on the page - presumably
        # the poster; confirm against the current page layout.
        urllib.request.urlretrieve(soup.img['src'], f"posters/{movie_id}.jpg")
    except Exception:
        # Record the failure and continue. Catching Exception rather than
        # using a bare `except` keeps Ctrl-C able to stop the loop.
        bad_request_2.append(movie_id)

9742it [1:26:48,  1.87it/s]


In [216]:
# Display the final movies table (pandas truncates the rendering of long frames).
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017
9739,193585,Flint,[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018
