Thinking of APR. Are we doing the dual optimization with the classification model + the pertubation added?

To do: think of how to split data to train/test/ tune 80/10/10 --> Done. Ready to run experiment

In [95]:
#import relevant library

import math
import os
import random
import pickle
import argparse
from collections import deque
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
# from torchvision import transforms
from torch.utils.data import IterableDataset, DataLoader, get_worker_info
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



In [45]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
## time the process
def get_time_dif(start_time):
    """get the running time"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [None]:

## set up the u,i,j triplet for BPR framework
class GetTriplePair(IterableDataset):
    # for ml-1m we load in 3760 item 6040 user and 994169 train pair
    def __init__(self, item_size, user_list, pair, shuffle, num_epochs):
        self.item_size = item_size
        self.user_list = user_list
        self.pair = pair
        self.shuffle = shuffle
        self.num_epochs = num_epochs

    def __iter__(self):
        self.example_size = self.num_epochs * len(self.pair)
        self.example_index_queue = deque([])
        self.seed = 0
        self.start_list_index = None
        self.num_workers = 1
        self.index = 0
        return self

    def __next__(self):
        if self.index >= self.example_size:
            raise StopIteration
        # If `example_index_queue` is used up, replenish this list.
        while len(self.example_index_queue) == 0:
            index_list = list(range(len(self.pair)))
            if self.shuffle:
                random.Random(self.seed).shuffle(index_list)
                self.seed += 1
            if self.start_list_index is not None:
                index_list = index_list[self.start_list_index::self.num_workers]

                # Calculate next start index
                self.start_list_index = (self.start_list_index + (self.num_workers - (len(self.pair) % self.num_workers))) % self.num_workers
            self.example_index_queue.extend(index_list)
        result = self._example(self.example_index_queue.popleft())
        self.index += self.num_workers
        return result

    def _example(self, idx):
        # in a train pair, format = (u,i), j = a random item which does not exist in user u's list of items
        u = self.pair[idx][0]
        i = self.pair[idx][1]
        j = np.random.randint(self.item_size)
        while j in self.user_list[u]:
            j = np.random.randint(self.item_size)
        return u, i, j


In [96]:
class Adversary(torch.nn.Module):
    def __init__(self, input_dim, output_dim, with_y=False, with_logits=False, use_mlp=False, with_logits_y=False,
                 with_single_y=False):
        super(Adversary4Z, self).__init__() #inherit properties
        self.c = torch.nn.Parameter(torch.tensor(1.0, requires_grad=True)) #initialize parameters to optimize for
        self.input_dim = input_dim #number of labels
        self.with_y = with_y #number of groups
        self.with_logits = with_logits 
        self.with_logits_y = with_logits_y
        self.with_single_y = with_single_y

        # the basic input = [s], to determine which defition of fairness we are going with
        if self.with_logits:  # input = concat([input, logits])
            self.input_dim += input_dim
        if self.with_y:  # input = concat([input, s*y, s*(1-y)])
            self.input_dim += input_dim * 2
        if self.with_logits_y:  # input = concat([input, logits*y, logits*(1-y)])
            self.input_dim += input_dim * 2
        if self.with_single_y:  # input = concat([input, y])
            self.input_dim += 1

        self.use_mlp = use_mlp
        # basically its a mlp that predict group from input dimension
        if self.use_mlp:
            hidden_dim = [128, 128]
            hidden_dim = [self.input_dim] + list(hidden_dim) + [output_dim]
            self.seq = torch.nn.ModuleList() #this is just like a python list but an instance of submodule
            for i in range(1, len(hidden_dim)):
                self.seq.append(torch.nn.Linear(hidden_dim[i - 1], hidden_dim[i]))
                if i != (len(hidden_dim) - 1):
                    self.seq.append(torch.nn.ReLU())
        else:
            self.fc = torch.nn.Linear(self.input_dim, output_dim, bias=True)
            
    # Forward defines the computation in the model
    def forward(self, inputs, *, y=None):
        assert len(inputs.shape) == 2 #test if shape of input is 2 else raise assertion error

        #define for binary -- use sigmoid and multiclass -- use softmax adversary
        if inputs.shape[1] > 1:
            s = torch.softmax(inputs * (1 + torch.abs(self.c)), dim=-1)
            if self.with_y or self.with_logits_y or self.with_single_y:
                raise NotImplementedError  # only support binary case with y
        else:
            s = torch.sigmoid(inputs * (1 + torch.abs(self.c)))

        if self.with_y:
            assert y is not None #if adversary test with y then there must be an available value for y
            _y = y.view(-1, 1).long() # -1 is infered from other dim, if dim = n x n then view (-1,1) transform to n x 1
            assert len(_y) == inputs.shape[0] #size of y must be compatible with the input
            encoded_inputs = torch.cat([s, s * _y, s * (1 - _y)], dim=1)
        else:
            assert y is None
            encoded_inputs = s

        if self.with_logits:
            encoded_inputs = torch.cat([encoded_inputs, inputs], dim=1)

        if self.with_logits_y:
            assert y is not None
            _y = y.view(-1, 1).long()
            assert len(_y) == inputs.shape[0]
            encoded_inputs = torch.cat([encoded_inputs, inputs * _y, inputs * (1 - _y)], dim=1)

        if self.with_single_y:
            assert y is not None
            _y = y.view(-1, 1).long()
            assert len(_y) == inputs.shape[0]
            encoded_inputs = torch.cat([encoded_inputs, _y], dim=1)

        if self.use_mlp:
            logits = encoded_inputs
            for i, l in enumerate(self.seq):
                logits = l(logits)
        else:
            logits = self.fc(encoded_inputs)

        return logits


In [75]:
hidden_dim = [128, 128]
hidden_dim = [2] + list(hidden_dim) + [2]
seq = torch.nn.ModuleList() #this is just like a python list but an instance of submodule
for i in range(1, len(hidden_dim)):
    seq.append(torch.nn.Linear(hidden_dim[i - 1], hidden_dim[i]))
    if i != (len(hidden_dim) - 1):
        seq.append(torch.nn.ReLU())

In [76]:
seq

ModuleList(
  (0): Linear(in_features=2, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=128, bias=True)
  (3): ReLU()
  (4): Linear(in_features=128, out_features=2, bias=True)
)

In [54]:

## chunk to define matrix factorization part
class fair_reprogram(nn.Module):
    def __init__(self, user_emb, item_emb, dim, reg, reg_adv, eps):
        super().__init__()
        ##init the embedding for U and I
        self.user_emb = user_emb  # User embedding taken from the pre-trained model (retrieved from checkpoint)
        self.item_emb = item_emb  # Item embedding taken from the pre-trained model (retrieved from checkpoint)
        self.reg = reg
        self.dim = dim
        self.reg_adv = reg_adv
        self.eps = eps
        self.update_u = None
        self.update_i = None
        self.update_j = None

## we first initialize an mlp that capable of tracing the the age or gender of the users
    def forward(self, u, i, epoch):

        ##u,i,j respectively, each is a vector of dim embedding (default = 64), retrieve embedding from previous checkpoint
        u = self.user_emb[u, :]
        i = self.item_emb[i, :]


        ## predicted score for ui uj
        x_ui = torch.mul(u, i).sum(dim=1)

        ## predict the gender from the predicted scores and from the true y value 
        y_logits = adv(x_ui) # model outputs raw logits 
        y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
        
        ## original bpr loss,
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(y_logits, y_blob_train)

        loss.backward()
        return loss
        # add adv training after a certain number of epochs, here is the part which we add hypernet module
        if epoch not in range(args.epochs, args.adv_epoch + args.epochs):
            """Normal training"""
            loss.backward()
            return loss

#         else:
#             """Adversarial training:
#                     1.Backward to get grads
#                     2.Construct adversarial perturbation
#                     3.Add adversarial perturbation to embeddings
#                     4.Calculate APR loss
#             """
#             # Backward to get grads
#             # this would be the part we change in defining delta, delta = HPN (phi)

#             # should we calculate based on gradient of the adv_loss instead of the loss function?, originally, computed based on loss function
#             loss.backward(retain_graph=True) ## need to retain graph here so as to we can backprop the adv_loss
#             ##recheck this
#             grad_u = u.grad
#             grad_i = i.grad
#             grad_j = j.grad

#             # Construct adversarial perturbation based on gradient of loss function, and normalize it with epsilon * norm
#             if grad_u is not None:
#                 delta_u = nn.functional.normalize(grad_u, p=2, dim=1, eps=self.eps)
#             else:
#                 delta_u = torch.rand(u.size())
#             if grad_i is not None:
#                 delta_i = nn.functional.normalize(grad_i, p=2, dim=1, eps=self.eps)
#             else:
#                 delta_i = torch.rand(i.size())
#             if grad_j is not None:
#                 delta_j = nn.functional.normalize(grad_j, p=2, dim=1, eps=self.eps)
#             else:
#                 delta_j = torch.rand(j.size())

#             # Add adversarial perturbation to embeddings, now we have q+delta, p+delta
#             x_ui_adv = torch.mul(u + delta_u, i + delta_i).sum(dim=1)
#             x_uj_adv = torch.mul(u + delta_u, j + delta_j).sum(dim=1)

#             # find difference between pos and neg item, then clip value
#             x_uij_adv = torch.clamp(x_ui_adv - x_uj_adv,min=-80.0,max=1e8)

#             # Calculate APR loss with logsigmoid
#             log_prob = F.logsigmoid(x_uij_adv).sum()
#             adv_loss = self.reg_adv *(-log_prob) + loss # this is adversarial loss (equation 4 in paper)
#             adv_loss.backward()

#             return adv_loss

In [None]:
def fairness_reprogramming(self, u, i, j):
    """Reprogramming phase:
        1.Freeze the user and item embedding -- done by saving checkpoint
        2.Calculate the perturbation to achieve fairness objective
        3.Add perturbation to the alr frozen embedding
        4.Calculate the overall loss function after update
    """

    # Initialize a fix random perturbation
    perturbation = torch.rand(1)
    
    #load user and item embedding, which has been trained in BPR
    u = list(model1.items())[0][1]
    i = list(model1.items())[1][1][i]
    j = list(model1.items())[1][1][j]
        
    # Add adversarial perturbation to embeddings, now we have q+delta, p+delta
    x_ui_adv = torch.mul(u , i + perturbation).sum(dim=1)
    x_uj_adv = torch.mul(u , j + perturbation).sum(dim=1)

    # find difference between pos and neg item, then clip value
    x_uij_adv = torch.clamp(x_ui_adv - x_uj_adv,min=-80.0,max=1e8)

    # Calculate loss with perturbed embedding with logsigmoid
    log_prob = F.logsigmoid(x_uij_adv).sum()
            
    #set up an adversary to identify group of items
    adversary_rs        
            
    # modify the adversarial loss here
    adv_loss = self.reg_adv *(-log_prob) + loss # this is adversarial loss (equation 4 in paper)
    adv_loss.backward()

    return adv_loss

In [68]:
#load the results of BPR
model1 = (torch.load('models/01_pytorch_workflow_model_1.pth'))
list(model1.items())[1][1].size()

torch.Size([3760, 64])

In [94]:
list(model1.items())[0][1]

tensor([[ 9.3768e-04, -4.2642e-02,  2.0659e-02,  ..., -2.2226e-02,
          3.1339e-02, -3.3840e-02],
        [ 1.5390e-04, -3.8164e-03,  1.1938e-02,  ..., -1.9504e-03,
         -5.5167e-03,  2.5199e-02],
        [-1.7836e-03, -3.0050e-02, -3.0792e-02,  ...,  1.1926e-02,
         -1.7736e-02,  6.9641e-03],
        ...,
        [-3.4588e-02, -1.5128e-03, -5.1930e-02,  ...,  3.3526e-02,
         -3.9308e-03,  1.3962e-02],
        [ 1.6954e-05, -1.2272e-02,  3.1606e-02,  ..., -2.2976e-02,
         -1.5850e-03, -2.3360e-02],
        [ 1.0627e-02, -2.0485e-02, -3.4491e-02,  ..., -1.7362e-02,
          3.7945e-03,  7.2581e-03]])

In [128]:
adv = Adversary4Z(input_dim=2, output_dim =6,use_mlp=True)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(adv.parameters(), lr=1e-3)

# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [129]:
# Fit the model
torch.manual_seed(42)

# Set number of epochs
epochs = 100

test_df = pd.read_csv('Data/train_df.csv', sep=',', encoding="utf-8",engine='python',header =None)
test_df = test_df.iloc[:, [3, 5, 6]]

X = torch.tensor(np.array(test_df.iloc[:,[0,2]])).type(torch.float)
Y = torch.tensor(test_df.iloc[:,1].astype('category').cat.codes).type(torch.LongTensor)   


X_blob_train, X_blob_test, y_blob_train, y_blob_test = train_test_split(X,
    Y,
    test_size=0.2,
    random_state=181
)

# Put data to target device
X_blob_train, y_blob_train = X_blob_train.to(device), y_blob_train.to(device)
X_blob_test, y_blob_test = X_blob_test.to(device), y_blob_test.to(device)

for epoch in range(epochs):
    ### Training
    adv.train()

    # 1. Forward pass
    y_logits = adv(X_blob_train) # model outputs raw logits 
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
    # print(y_logits)
    # 2. Calculate loss and accuracy
    loss = loss_fn(y_logits, y_blob_train) 
    acc = accuracy_fn(y_true=y_blob_train,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    adv.eval()
    with torch.inference_mode():
      # 1. Forward pass
        test_logits = adv(X_blob_test)
        test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)
      # 2. Calculate test loss and accuracy
        test_loss = loss_fn(test_logits, y_blob_test)
        test_acc = accuracy_fn(y_true=y_blob_test,
                             y_pred=test_pred)

    # Print out what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Acc: {test_acc:.2f}%") 

Epoch: 0 | Loss: 1.79241, Acc: 0.00% | Test Loss: 1.74086, Test Acc: 73.26%
Epoch: 10 | Loss: 1.26619, Acc: 75.38% | Test Loss: 1.21143, Test Acc: 75.12%
Epoch: 20 | Loss: 0.78448, Acc: 75.38% | Test Loss: 0.75899, Test Acc: 75.12%
Epoch: 30 | Loss: 0.62760, Acc: 75.38% | Test Loss: 0.62399, Test Acc: 75.12%
Epoch: 40 | Loss: 0.57636, Acc: 75.38% | Test Loss: 0.57793, Test Acc: 75.12%
Epoch: 50 | Loss: 0.56802, Acc: 75.38% | Test Loss: 0.56953, Test Acc: 75.12%
Epoch: 60 | Loss: 0.56411, Acc: 75.38% | Test Loss: 0.56688, Test Acc: 75.12%
Epoch: 70 | Loss: 0.56192, Acc: 75.38% | Test Loss: 0.56456, Test Acc: 75.12%
Epoch: 80 | Loss: 0.56090, Acc: 75.38% | Test Loss: 0.56373, Test Acc: 75.12%
Epoch: 90 | Loss: 0.56030, Acc: 75.38% | Test Loss: 0.56305, Test Acc: 75.12%


In [102]:
y_blob_train.size()

torch.Size([637114])

In [123]:
X_blob_train.view(-1,1).size()

torch.Size([1274228, 1])

In [78]:
#goal = input y and yhat and return the group of the item?

# Build model
class adversary_rs(nn.Module):
    def __init__(self, input_features, output_features, hidden_units=8):
        """Initializes all required hyperparameters for a multi-class classification model.

        Args:
            input_features (int): Number of input features to the model.
            out_features (int): Number of output features of the model
              (how many classes there are).
            hidden_units (int): Number of hidden units between layers, default 8.
        """
        super().__init__()
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=hidden_units),
            # nn.ReLU(), # <- does our dataset require non-linear layers? (try uncommenting and see if the results change)
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            # nn.ReLU(), # <- does our dataset require non-linear layers? (try uncommenting and see if the results change)
            nn.Linear(in_features=hidden_units, out_features=output_features), # how many classes are there?
        )
    
    def forward(self, x):
        return self.linear_layer_stack(x)

# Create an instance of BlobModel and send it to the target device
mlp = adversary_rs(input_features=1, 
                    output_features=6, 
                    hidden_units=500).to(device)


In [79]:
 # Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
  # Prepare CIFAR-10 dataset
# trainloader = torch.utils.data.DataLoader(X, batch_size=512, shuffle=True, num_workers=1)


In [80]:
mlp

adversary_rs(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=1, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=500, bias=True)
    (2): Linear(in_features=500, out_features=6, bias=True)
  )
)

In [153]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [154]:
# Fit the model
torch.manual_seed(42)

# Set number of epochs
epochs = 1000

test_df = pd.read_csv('test_adversary2.dat', sep=',', encoding="utf-8",engine='python')
test_df = test_df[['genres', 'rating']]

X = torch.tensor(test_df['rating'].values).type(torch.float)
Y = torch.tensor(test_df['genres'].astype('category').cat.codes).type(torch.LongTensor)   


X_blob_train, X_blob_test, y_blob_train, y_blob_test = train_test_split(X,
    Y,
    test_size=0.2,
    random_state=181
)

# Put data to target device
X_blob_train, y_blob_train = X_blob_train.to(device).view(-1,1), y_blob_train.to(device)
X_blob_test, y_blob_test = X_blob_test.to(device).view(-1,1), y_blob_test.to(device)

for epoch in range(epochs):
    ### Training
    mlp.train()

    # 1. Forward pass
    y_logits = mlp(X_blob_train) # model outputs raw logits 
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
    # print(y_logits)
    # 2. Calculate loss and accuracy
    loss = loss_fn(y_logits, y_blob_train) 
    acc = accuracy_fn(y_true=y_blob_train,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    mlp.eval()
    with torch.inference_mode():
      # 1. Forward pass
        test_logits = mlp(X_blob_test)
        test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)
      # 2. Calculate test loss and accuracy
        test_loss = loss_fn(test_logits, y_blob_test)
        test_acc = accuracy_fn(y_true=y_blob_test,
                             y_pred=test_pred)

    # Print out what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Acc: {test_acc:.2f}%") 

Epoch: 0 | Loss: 1.73429, Acc: 40.30% | Test Loss: 1.47605, Test Acc: 44.84%
Epoch: 10 | Loss: 1.42034, Acc: 44.80% | Test Loss: 1.40173, Test Acc: 44.84%
Epoch: 20 | Loss: 1.39443, Acc: 45.17% | Test Loss: 1.38122, Test Acc: 45.13%
Epoch: 30 | Loss: 1.37248, Acc: 45.83% | Test Loss: 1.35702, Test Acc: 46.02%
Epoch: 40 | Loss: 1.35933, Acc: 47.01% | Test Loss: 1.34779, Test Acc: 46.02%
Epoch: 50 | Loss: 1.35134, Acc: 48.19% | Test Loss: 1.34180, Test Acc: 48.67%
Epoch: 60 | Loss: 1.34569, Acc: 47.82% | Test Loss: 1.33921, Test Acc: 48.38%
Epoch: 70 | Loss: 1.34156, Acc: 47.97% | Test Loss: 1.33752, Test Acc: 48.97%
Epoch: 80 | Loss: 1.33857, Acc: 48.12% | Test Loss: 1.33625, Test Acc: 48.67%
Epoch: 90 | Loss: 1.33634, Acc: 48.12% | Test Loss: 1.33610, Test Acc: 48.67%
Epoch: 100 | Loss: 1.33465, Acc: 48.12% | Test Loss: 1.33587, Test Acc: 48.67%
Epoch: 110 | Loss: 1.33335, Acc: 48.12% | Test Loss: 1.33594, Test Acc: 48.67%
Epoch: 120 | Loss: 1.33233, Acc: 48.12% | Test Loss: 1.33612, T

In [64]:
test_df = pd.read_csv('Data/train_df.csv', sep=',', encoding="utf-8",engine='python', header=None)
test_df.iloc[:, 5].groupby(['5']).size().reset_index(name='counts').sort_values(by=['counts'])

KeyError: '5'

In [6]:
test_df = pd.read_csv('test_adversary2.dat', sep=',', encoding="utf-8",engine='python')
test_df.head

<bound method NDFrame.head of       Unnamed: 0  movie_id    genres    rating
0              4         5    Comedy  3.006757
1              8         9    Action  2.656863
2             13        14     Drama  3.542484
3             17        18  Thriller  3.337580
4             18        19    Comedy  2.480720
...          ...       ...       ...       ...
1689        3700      3947  Thriller  3.472727
1690        3701      3948    Comedy  3.635731
1691        3702      3949     Drama  4.115132
1692        3703      3950     Drama  3.666667
1693        3704      3951     Drama  3.900000

[1694 rows x 4 columns]>

In [129]:
X_blob.shape

torch.Size([1000, 2])

In [135]:
torch.unique(Y)

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 1

In [2]:
import pandas as pd


In [29]:
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']


train = pd.read_csv('Data/ml-1m.train.rating', sep='\t', encoding="utf-8",engine='python',header=None, names=rnames)

train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,32,4,978824330
1,0,34,4,978824330
2,0,4,5,978824291
3,0,35,4,978824291
4,0,30,4,978824291


In [30]:
test = pd.read_csv('Data/ml-1m.test.rating', sep='\t', encoding="utf-8",engine='python',header=None,  names=rnames)
test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504
3,3,208,4,978294282
4,4,222,2,978246585


In [32]:
train[train['movie_id'] == 402]

Unnamed: 0,user_id,movie_id,rating,timestamp
465,5,402,3,978237813
7628,52,402,4,977979943
8983,61,402,3,977969947
10729,77,402,3,977811982
31983,215,402,1,976864249
...,...,...,...,...
971086,5887,402,3,957481438
975314,5916,402,2,957315233
975787,5921,402,3,957297788
981240,5957,402,2,957062785


In [34]:
test[test['movie_id'] == 402]

Unnamed: 0,user_id,movie_id,rating,timestamp
5596,5596,402,5,959215944
5953,5953,402,1,957707693


In [36]:
train['movie_id'].nunique()

3704