In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the four raw text files with identical parsing options: no header row,
# only columns 0 and 1 are kept, and they are named user_id and rating.
raw_files = [
    'combined_data_1.txt',
    'combined_data_2.txt',
    'combined_data_3.txt',
    'combined_data_4.txt',
]
read_options = dict(header=None, names=['user_id', 'rating'], usecols=[0, 1])

# Files are read in list order, one frame per file.
df_1, df_2, df_3, df_4 = (
    pd.read_csv(raw_file, **read_options) for raw_file in raw_files
)

In [3]:
# Combine the separate files into one frame, stacked in file order.
# pd.concat replaces the chained DataFrame.append calls (append is deprecated
# and removed in pandas 2.0, and each append re-copied the accumulated data).
# ignore_index=True gives a fresh 0..n-1 index, which is what the former
# reset_index() + drop(columns=['index']) pair produced.
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

In [4]:
# Collect the rows that carry no rating into their own dataframe.
# After reset_index() the 'index' column holds each such row's position in df,
# and the 'rating' column is the (all True) "is missing" flag.
rating_missing = df['rating'].isnull()
df_nan = rating_missing[rating_missing].to_frame().reset_index()

In [5]:
# Create a movie_id numpy array to add into the previous dataframe.
#
# df_nan['index'] holds the positions in df of the rows without a rating.
# The rows strictly between two consecutive such positions all get the same
# movie_id, numbered from 1 in order of appearance. The rows after the LAST
# position are not handled here; the next cell appends them using movie_id.
#
# The per-movie counts are computed in one shot and expanded with np.repeat
# instead of calling np.append inside a loop, which re-copied the whole
# growing array on every iteration and turned it into a float array.
header_rows = df_nan['index'].values
ratings_per_movie = np.diff(header_rows) - 1  # rows between consecutive markers

movie_np = np.repeat(np.arange(1, len(ratings_per_movie) + 1), ratings_per_movie)

# Id of the last movie, whose rows are still to be appended by the next cell.
movie_id = len(ratings_per_movie) + 1

In [6]:
# Finish movie_np by adding the entries for the rows that follow the last
# no-rating row in df; they all receive the current movie_id.
# NOTE: run this cell once per run of the previous cell. Running it again
# appends the same tail to movie_np a second time.
last_header_row = df_nan['index'].iloc[-1]
tail_length = len(df) - last_header_row - 1

last_record = np.full((1, tail_length), movie_id)
movie_np = np.append(movie_np, last_record)

In [7]:
# Get rid of the rows with no rating, those with movie_id information.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the unfiltered frame
# (this is what raised SettingWithCopyWarning).
df = df[pd.notnull(df['rating'])].copy()

# movie_np must provide exactly one movie_id per remaining rating row;
# fail early with a readable message if the earlier cells got out of sync.
assert len(movie_np) == len(df), (
    'movie_np has {} entries but df has {} rating rows'.format(len(movie_np), len(df))
)

# Change each column to correct type
df['movie_id'] = movie_np.astype(int)
df['user_id'] = df['user_id'].astype(int)
df['rating'] = df['rating'].astype(float)

In [8]:
# Sanity check: report (rows, columns) of the processed dataframe
shape_message = 'The dataframe has shape: {}'.format(df.shape)
print(shape_message)

The dataframe has shape: (100480507, 3)


In [9]:
# Make movie_id start from 0 instead of 1.
# NOTE: this is not idempotent; re-running the cell subtracts 1 again.
df['movie_id'] = df['movie_id'] - 1

# Create a mapping from each distinct user_id to a dense index 0..n_users-1.
# Indices follow the iteration order of the set, as before.
unique_user_ids = set(df['user_id'].values)
user2idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
count = len(user2idx)  # number of distinct users

# Apply the mapping to the dataframe.
# Series.map does the lookup column-wise; the former row-wise
# df.apply(..., axis=1) called a Python lambda once for every row.
df['user_map'] = df['user_id'].map(user2idx)

In [10]:
# Persist the processed ratings as an intermediate CSV; the dataframe index
# is not written to the file.
df.to_csv(path_or_buf='processed_rating.csv', index=False)