In [1]:
import csv
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE: the IMDb scraping cell also calls BeautifulSoup (third-party package
# `bs4`), which this notebook never imports; it must be in scope before that
# cell is run.

# Load original data from MovieLens

In [2]:
# Folder holding the extracted "ml-latest" archive, followed by the names of
# the two CSV files this notebook reads from it.
data_directory = "../../Downloads/ml-latest/"
movieFile, ratingFile = "movies.csv", "ratings.csv"

In [3]:
# Read both raw tables. The rating timestamps are parsed with pandas' nullable
# integer dtype ("Int64").
df_movie = pd.read_csv(f"{data_directory}{movieFile}")
df_rating = pd.read_csv(
    f"{data_directory}{ratingFile}",
    dtype={"timestamp": "Int64"},
)

In [4]:
# Preview the first rows of the raw ratings table
# (columns: userId, movieId, rating, timestamp).
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
# Preview the first rows of the raw movies table
# (columns: movieId, title, genres; genres are '|'-separated).
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Target dataset

## movies table
### movie_id, imdb_id, title, genres, imdb_url, poster_url

## ratings table
### rater_id, movie_id, rating, timestamp

# Scrape info from IMDB

In [None]:
# Start a fresh output CSV that contains only the header row.  Opening with
# mode 'w' truncates any existing file, so do not re-run this cell when
# resuming an interrupted scrape.
movieDatasetFile = 'movie_full_dataset.csv'
with open(data_directory + movieDatasetFile, 'w', newline='') as out_csv:
    writer = csv.writer(out_csv, delimiter=',')
    writer.writerow(['movie_id', 'imdb_id', 'title', 'genres', 'imdb_url', 'poster_url'])

# Positional index of the last df_movie row that has already been scraped;
# -1 means nothing has been processed yet.  The scraping cell and the resume
# cell both use the name `endingPoint`; the previous spelling `endPoint` left
# `endingPoint` undefined on a fresh kernel.
endingPoint = -1
endPoint = endingPoint  # old spelling kept so any existing reference still works

In [None]:
%%time
# Scrape IMDb for every df_movie row after `endingPoint` and append one CSV row
# per movie for which both a search hit and a poster image were found.
# `endingPoint` is the positional index of the last row already handled
# (-1 for a fresh start).  Network errors are deliberately not caught here:
# they abort the cell so that the resume cell can compute a new `endingPoint`.
total = len(df_movie)
count = endingPoint
domain = 'http://www.imdb.com'
extension = '.jpg'

# Navigating a page that lacks the expected markup can fail in three ways:
# attribute access on None (AttributeError), subscripting None (TypeError) or
# a tag without the requested attribute (KeyError).  All three mean
# "nothing usable on this page", so the movie is skipped.
missingMarkupErrors = (AttributeError, TypeError, KeyError)

# utf-8 matches the default encoding pd.read_csv uses when this file is read back.
with open(data_directory + movieDatasetFile, 'a', newline='', encoding='utf-8') as out_csv:
    writer = csv.writer(out_csv, delimiter=',')
    for i in range(endingPoint + 1, total):
        row = df_movie.iloc[i]
        movieId = row['movieId']
        title = row['title']
        genres = row['genres']
        search_url = domain + '/find?q=' + urllib.parse.quote_plus(title)

        count += 1
        if (count % 100 == 0):
            print(f'{count} processing, {total - count} left')

        # Step 1: search by title and take the link of the first result.
        with urllib.request.urlopen(search_url) as searchResponse:
            searchSoup = BeautifulSoup(searchResponse.read(), 'html.parser')
        try:
            foundTitle = searchSoup.find('table', class_='findList').tr.a['href']
        except missingMarkupErrors:
            # The search returned no results.
            continue

        movieUrl = domain + foundTitle
        # The third path component is the IMDb id; drop its two-letter prefix.
        imdbId = int(foundTitle.split('/')[2][2:])

        # Step 2: open the movie page and locate the poster image.
        with urllib.request.urlopen(movieUrl) as movieResponse:
            movieSoup = BeautifulSoup(movieResponse.read(), 'html.parser')
        try:
            posterSrc = movieSoup.find('div', class_='poster').a.img['src']
        except missingMarkupErrors:
            # The movie page has no poster.
            continue

        # Keep everything before the first '_' of the image URL and append '.jpg'.
        posterUrl = posterSrc.partition('_')[0] + extension
        writer.writerow([movieId, imdbId, title, genres, movieUrl, posterUrl])

### If the IMDb host closes the connection, run the cell below to find the breakpoint, then re-run the scraping cell above to resume.

In [None]:
# Typical failure that interrupts the scraping cell:
# URLError: <urlopen error [Errno 54] Connection reset by peer>
#
# Work out where to resume: read what has been written so far, take the
# movie_id of the last written row and find its position in df_movie.
# `endingPoint` is that position; the scraping cell continues from the row after it.
movieDatasetFile = 'movie_full_dataset.csv'
df_movieDataset = pd.read_csv(data_directory + movieDatasetFile)

if df_movieDataset.empty:
    # Only the header exists so far: restart from the first row.
    endingPoint = -1
    print("no scraped rows yet, starting from the first row")
else:
    endingId = df_movieDataset['movie_id'].iloc[-1]
    matches = np.flatnonzero((df_movie['movieId'] == endingId).to_numpy())
    if matches.size == 0:
        # Falling through here used to leave endingPoint at the last row of
        # df_movie, which silently skipped all remaining movies.
        raise ValueError(
            f"movie_id {endingId} from {movieDatasetFile} was not found in df_movie"
        )
    endingPoint = int(matches[0])
    print(f"found ending point {endingPoint}th row")

In [7]:
# Reload the scraped movie table and report how many source rows produced a poster.
movieDatasetFile = 'movie_full_dataset.csv'
df_movieDataset = pd.read_csv(f'{data_directory}{movieDatasetFile}')
print(
    f'yield {len(df_movieDataset)} rows with poster urls '
    f'from {len(df_movie)} rows in original dataset'
)

yield 53806 rows with poster urls from 58098 rows in original dataset


# Normalize ratings using Decoupling Normalization

In [None]:
# Sorted array of the distinct user ids that appear in the ratings table.
allRaters = np.unique(df_rating['userId'])

In [None]:
# Map each rater to the list of ratings they gave, in file order.
# One grouped pass replaces the per-row df_rating.iloc[i] loop, which performs
# a Python-level row lookup for every single rating.  groupby sorts the keys
# and keeps the original row order inside each group, so both the dict order
# and the per-rater lists match what the loop produced.
ratingsPerRater = {
    rater: raterRatings.tolist()
    for rater, raterRatings in df_rating.groupby('userId')['rating']
}

In [None]:
# Rating values in half-star steps from 0.5 to 5.  Kept as a module-level
# constant for reference; the helpers below no longer require ratings to lie
# on this grid.
POSSIBLE_RATINGS = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]


def getProb(ratings, r):
    """Return P(rating < r) + P(rating == r) / 2 within one rater's ratings.

    Args:
        ratings: sequence of numeric ratings given by a single rater.
        r: the rating value to score.

    Returns:
        A float in [0, 1]; NaN when `ratings` is empty (no distribution to
        compare against).
    """
    values = np.asarray(ratings, dtype=float)
    n = values.size
    if n == 0:
        return float('nan')
    p1 = np.count_nonzero(values < r) / n
    p2 = np.count_nonzero(values == r) / n
    return p1 + p2 / 2


def getDecoupledRatings(ratings):
    """Replace each rating by its decoupled score from `getProb`.

    The score is computed once per distinct value that actually occurs in
    `ratings`, so values outside POSSIBLE_RATINGS are handled instead of
    raising KeyError.

    Args:
        ratings: sequence of numeric ratings given by a single rater.

    Returns:
        A list of floats, one per input rating and in the same order; an
        empty list for empty input.
    """
    values = np.asarray(ratings, dtype=float)
    prob = {r: getProb(values, r) for r in np.unique(values).tolist()}
    decoupledRatings = [prob[rating] for rating in values.tolist()]
    return decoupledRatings

def getGaussianNormalizedRatings(ratings):
    """Z-score one rater's ratings: (rating - mean) / population std.

    Args:
        ratings: sequence of numeric ratings given by a single rater.

    Returns:
        A float ndarray with one entry per input rating.  When every rating
        is identical (including a single rating) the standard deviation is 0,
        and an array of zeros is returned instead of dividing by zero.  Empty
        input yields an empty array.
    """
    ratings = np.asarray(ratings, dtype=float)
    if ratings.size == 0:
        return ratings
    std = np.std(ratings)
    if std == 0:
        # No spread: every rating sits exactly on the mean.
        return np.zeros_like(ratings)
    mean = np.mean(ratings)
    return (ratings - mean) / std
        

In [None]:
# Map each rater to their decoupled ratings; each list is in the same order as
# ratingsPerRater[rater].
decoupledRatingPerrater = {rater: getDecoupledRatings(ratingsPerRater[rater]) for rater in allRaters}

In [None]:
# Map each rater to the (movieId, timestamp) pairs of their ratings, in file
# order.  One grouped pass replaces the per-row df_rating.iloc[i] loop;
# groupby keeps the original row order inside each group, so the pairs line up
# index-for-index with the ratings collected per rater.
otherInfosPerRater = {
    rater: list(zip(raterRows['movieId'].tolist(), raterRows['timestamp'].tolist()))
    for rater, raterRows in df_rating.groupby('userId')
}

In [None]:
# Create (or truncate) the normalized-ratings CSV and write its header row.
ratingDatasetFile = 'rating_full_dataset_normalized.csv'
ratingHeader = ['rater_id', 'movie_id', 'rating', 'timestamp']
with open(data_directory + ratingDatasetFile, 'w', newline='') as out_csv:
    csv.writer(out_csv, delimiter=',').writerow(ratingHeader)

In [None]:
%%time
# Append every rater's normalized ratings to the output CSV.
# getProb yields scores strictly between 0 and 1 for ratings that occur in the
# rater's own list, so subtracting 0.5 centres the written value on 0.
total = len(allRaters)
count = 0
with open(data_directory + ratingDatasetFile, 'a', newline='') as out_csv:
    writer = csv.writer(out_csv, delimiter=',')
    for rater in allRaters:
        raterInfos = otherInfosPerRater[rater]
        raterDecoupled = decoupledRatingPerrater[rater]

        if len(raterInfos) != len(raterDecoupled):
            # The two lists are paired by position.  If their lengths differ the
            # pairing cannot be trusted, so report the rater and write nothing
            # for them (previously this raised IndexError part-way through the
            # file or wrote a truncated set of rows).
            print(f"error! rater: {rater} len_info {len(raterInfos)} len_decoupled {len(raterDecoupled)}")
        else:
            writer.writerows(
                [rater, movieId, rating - 0.5, timestamp]
                for (movieId, timestamp), rating in zip(raterInfos, raterDecoupled)
            )

        count += 1
        if count % 100 == 0:
            print(f'processed {count} raters, {total - count} left')

In [8]:
# Reload the normalized ratings that were written to disk.
ratingDatasetFile = 'rating_full_dataset_normalized.csv'
df_ratingDataset = pd.read_csv(f'{data_directory}{ratingDatasetFile}')

# Only preserve ratings that are associated with a movie present in movie database

In [48]:
# Ids of all scraped movies; the two printed counts are equal when no id is duplicated.
movieIds = list(df_movieDataset['movie_id'])
print(len(movieIds), np.unique(movieIds).size)

53806 53806


In [49]:
# Keep only the ratings whose movie_id is present in the scraped movie table.
df_ratingDataset_filtered = df_ratingDataset.loc[
    df_ratingDataset['movie_id'].isin(movieIds)
]

In [50]:
# Report how many ratings survive the filter, relative to the raw ratings table.
print(
    f'yield {len(df_ratingDataset_filtered)} rows '
    f'from {len(df_rating)} rows in original dataset'
)

yield 25451571 rows from 27753444 rows in original dataset


In [42]:
# Persist the filtered ratings without the DataFrame index column.
ratingDatasetFile_filtered = 'rating_full_dataset_normalized_filtered.csv'
df_ratingDataset_filtered.to_csv(
    f'{data_directory}{ratingDatasetFile_filtered}',
    index=False,
)

In [60]:
# Number of distinct movies that still have at least one rating after filtering.
print(np.unique(df_ratingDataset_filtered['movie_id']).size)

49800


# Only preserve movies that have ratings in rating database

In [62]:
# Sorted distinct movie ids that occur in the filtered ratings.
moviesWithRatings = [*np.unique(df_ratingDataset_filtered['movie_id'])]

In [63]:
# Keep only the movies that have at least one rating, then report the count.
df_movieDataset_filtered = df_movieDataset.loc[
    df_movieDataset['movie_id'].isin(moviesWithRatings)
]
print(
    f'{len(df_movieDataset_filtered)} out of {len(df_movieDataset)} '
    f'movies have ratings in rating dataset'
)

49800 out of 53806 movies have ratings in rating dataset


In [64]:
# Persist the filtered movie table without the DataFrame index column.
movieDatasetFile_filtered = 'movie_full_dataset_filtered.csv'
df_movieDataset_filtered.to_csv(
    f'{data_directory}{movieDatasetFile_filtered}',
    index=False,
)