In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
def readInputFile(filename):
    """Parse a ratings file into a DataFrame of (User, Movie, Rating) rows.

    The file is read line by line. A line ending in ':' is a movie header
    whose leading part is the integer movie ID; it applies to every rating
    line that follows until the next header. Any other non-blank line is a
    comma-separated rating record whose first field is the integer user ID
    and whose second field is the rating; further fields are ignored.

    Blank or whitespace-only lines are skipped, and an empty file yields an
    empty DataFrame with the expected columns.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns 'User' (int), 'Movie' (int) and 'Rating' (float), one row
        per rating line, in file order.

    Raises
    ------
    ValueError
        If a rating line appears before any movie header, or if an ID or
        rating cannot be converted to a number.
    IndexError
        If a rating line has no comma-separated rating field.
    """
    # One flat list per column keeps integer IDs as integers and avoids
    # building a list-of-lists plus an intermediate float array.
    users = []
    movies = []
    ratings = []
    movieId = None

    with open(filename, "r") as f:
        for lineNumber, rawLine in enumerate(f, start=1):
            # strip() removes the trailing newline (including '\r\n') and
            # turns whitespace-only lines into '' so they can be skipped.
            line = rawLine.strip()
            if not line:
                continue

            if line.endswith(':'):
                movieId = int(line[:-1])
                continue

            if movieId is None:
                raise ValueError(
                    "{}: rating on line {} appears before any movie header".format(
                        filename, lineNumber
                    )
                )

            fields = line.split(',')
            users.append(int(fields[0]))
            movies.append(movieId)
            ratings.append(float(fields[1]))

    frame = pd.DataFrame({'User': users, 'Movie': movies, 'Rating': ratings})
    # The explicit cast also fixes the dtypes of an empty frame.
    return frame.astype({'User': int, 'Movie': int, 'Rating': float})

In [3]:
# Parse the four raw rating files in order, keeping each part available
# under its own name, then stack them into one frame with a fresh 0..n-1 index.
df1, df2, df3, df4 = (
    readInputFile(partPath)
    for partPath in (
        'data/combined_data_1.txt',
        'data/combined_data_2.txt',
        'data/combined_data_3.txt',
        'data/combined_data_4.txt',
    )
)
movieRatings = pd.concat(
    [df1, df2, df3, df4],
    ignore_index=True,
)
# Preview the first rows of the combined ratings.
movieRatings.head()

Unnamed: 0,User,Movie,Rating
0,1488844,1,3.0
1,822109,1,5.0
2,885013,1,4.0
3,30878,1,4.0
4,823519,1,3.0


In [4]:
# Check the shape of the dataframe. The number of rows
# is the number of ratings (approx 100M); the columns are User, Movie and Rating.
movieRatings.shape

(100480507, 3)

In [5]:
# Check whether any value anywhere in the dataframe is NaN/missing;
# the result is a single boolean for the whole frame.
movieRatings.isnull().values.any()

False

In [6]:
# Largest user ID that appears in the ratings
movieRatings['User'].max()

2649429

In [7]:
# Largest movie ID that appears in the ratings
movieRatings['Movie'].max()

17770

In [8]:
# Count the distinct users and distinct movies in the ratings.
# There are over 480k users and over 17k movies.
movieRatings[['User', 'Movie']].nunique()

User     480189
Movie     17770
dtype: int64

In [9]:
# Count how many ratings each user has given, most active users first.
# Every user has rated at least 1 movie.
movieRatings['User'].value_counts()

305344     17653
387418     17436
2439493    16565
1664010    15813
2118461    14831
           ...  
1629999        1
1741566        1
1472717        1
2381142        1
1744057        1
Name: User, Length: 480189, dtype: int64

In [10]:
# Count how many times each movie has been rated, most-rated movies first.
# Every movie has received at least 3 ratings.
movieRatings['Movie'].value_counts()

5317     232944
15124    216596
14313    200832
15205    196397
1905     193941
          ...  
4806         13
11344        10
6256         10
11148         5
13755         3
Name: Movie, Length: 17770, dtype: int64

In [11]:
# Analyse the distribution of ratings: normalize=True reports each rating
# value as a fraction of all ratings. sort=False skips ordering by frequency,
# so the rows are not guaranteed to appear in ascending rating order.
movieRatings['Rating'].value_counts(normalize=True, sort=False)

2.0    0.100836
3.0    0.286735
4.0    0.335896
5.0    0.230574
1.0    0.045959
Name: Rating, dtype: float64