In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import random 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from tqdm import tqdm

In [None]:
# Data-loading settings read by every DataLoader built later in this notebook.
config = dict(
    batch_size=5096,  # samples per mini-batch
    num_worker=8,     # value passed as DataLoader(num_workers=...)
)

In [None]:
# Load the pre-built train / test feature frames.
# NOTE: read_pickle executes arbitrary code on load -- only use trusted files.
train_prod, test_prod = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_prod_v15.pickle",
                        "../data/test_prod_v15.pickle")
)

print(train_prod.shape, test_prod.shape)

In [None]:
print("Find the age difference")

# Signed gap (from_age minus to_age), added to both frames.
for frame in (train_prod, test_prod):
    frame['age_difference'] = frame['from_age'] - frame['to_age']

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` element-wise with +/-inf mapped to 0.

    NaN results (e.g. 0 / 0) are left untouched; the clean-up cell that follows
    fills every remaining NaN with 0.
    """
    return (numerator / denominator).replace([np.inf, -np.inf], 0)


def add_swipe_ratio_features(frame):
    """Add the ten swipe-ratio columns to ``frame`` in place and return it.

    Using one table for both the train and the test frame guarantees that the
    two are engineered identically (the previous copy-pasted version had to be
    kept in sync by hand).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every numerator / denominator column listed below.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with the new columns appended.
    """
    # (new column, numerator column, denominator column)
    ratio_specs = [
        ('to_swipe_by_session_percentage',
         'to_total_swipe_counts', 'to_total_session_count'),
        # NOTE(review): the "from" ratio is divided by the *to* session count.
        # This looks like a copy-paste slip (from_total_session_count would be
        # the natural denominator) but is kept as-is so the features match what
        # existing models were trained on -- confirm before changing.
        ('from_swipe_by_session_percentage',
         'from_total_swipe_counts', 'to_total_session_count'),

        ('to_common_users_left_swipe_percentage',
         'common_users_swiped_left', 'to_swipe_left_count'),
        ('from_common_users_left_swipe_percentage',
         'common_users_swiped_left', 'from_swipe_left_count'),

        ('to_common_users_right_swipe_percentage',
         'common_users_swiped_right', 'to_swipe_right_count'),
        ('from_common_users_right_swipe_percentage',
         'common_users_swiped_right', 'from_swipe_right_count'),

        ('to_overall_common_users_left_swipe_percentage',
         'common_users_swiped_left', 'to_total_swipe_counts'),
        ('from_overall_common_users_left_swipe_percentage',
         'common_users_swiped_left', 'from_total_swipe_counts'),

        ('to_overall_common_users_right_swipe_percentage',
         'common_users_swiped_right', 'to_total_swipe_counts'),
        ('from_overall_common_users_right_swipe_percentage',
         'common_users_swiped_right', 'from_total_swipe_counts'),
    ]

    for new_column, numerator_column, denominator_column in ratio_specs:
        frame[new_column] = _safe_ratio(frame[numerator_column],
                                        frame[denominator_column])
    return frame


train_prod = add_swipe_ratio_features(train_prod)
test_prod = add_swipe_ratio_features(test_prod)

In [None]:
# Treat the 999999 placeholder and any missing value as 0 in both frames.
train_prod = train_prod.replace(999999, 0).fillna(0)
test_prod = test_prod.replace(999999, 0).fillna(0)

print(train_prod.shape, test_prod.shape)

In [None]:
def _train_columns_containing(pattern):
    """List the train_prod column names that match ``pattern`` (str.contains)."""
    all_columns = train_prod.columns
    return all_columns[all_columns.str.contains(pattern)].tolist()


# NOTE(review): presumably the least important features from a LightGBM run;
# not referenced again in the visible part of this notebook.
lgb_bottom_importance = [
    'from_purpose_id_12',
    'to_unique_degree_count',
    'from_purpose_id_3',
    'from_unique_school_count',
    'rev_strength_4',
    'to_unique_school_count',
    'rev_strength_7',
    'rev_strength_8',
    'rev_strength_6',
    'rev_strength_5',
]

# Column families, selected by name fragment.
self_intro_columns = _train_columns_containing("_self_intro_")

to_self_intro_columns = _train_columns_containing("to_self_intro_")
from_self_intro_columns = _train_columns_containing("from_self_intro_")

purpose_columns = _train_columns_containing("_purpose_id_")
rev_strength_columns = _train_columns_containing("rev_strength")
review_comments = _train_columns_containing("_review_comments_")

others = [
    'to_review_comments_count',
    'from_review_comments_count',
    'to_last_login_year',
]

In [None]:
# Target column.
dep = 'score'

# Columns excluded from the model inputs: three named columns plus every
# review-comment and rev_strength column.
drop = [
    'from-to',
    'user_purpose_cosine_similarity',
    'to_last_swipe_year',
    *review_comments,
    *rev_strength_columns,
]  # + from_self_intro_columns

# Everything that is neither the target nor dropped is a model input.
indep = train_prod.columns.difference([dep, *drop])

print("Indep length:", len(indep))
print("Columns that are dropped:", drop)

In [None]:
# Min-max scale the model inputs. The scaler is fitted on the training frame
# only and then applied unchanged to the test frame.
scaler = MinMaxScaler().fit(train_prod[indep])

train_prod[indep] = scaler.transform(train_prod[indep])
test_prod[indep] = scaler.transform(test_prod[indep])

train_prod.describe()

In [None]:
# Seed NumPy's global RNG so the split below is repeatable.
np.random.seed(100)

# Stratified 80 / 20 hold-out; returned as [X_train, X_test, y_train, y_test].
split_parts = train_test_split(
    train_prod[indep],
    train_prod[dep],
    test_size=0.2,
    stratify=train_prod[dep],
)

# Renumber rows from 0 in every part.
train_local_X, test_local_X, train_local_Y, test_local_Y = (
    part.reset_index(drop=True) for part in split_parts
)
print(train_local_X.shape, train_local_Y.shape, test_local_X.shape, test_local_Y.shape)

In [None]:
class MulticlassClassification(nn.Module):
    """Fully connected classifier that returns raw (un-normalised) class scores.

    Architecture::

        Linear(num_features -> hidden) -> BatchNorm -> ReLU
        [Linear(hidden -> hidden) -> BatchNorm -> ReLU] x shared_repeats
        Linear(hidden -> num_labels)

    The repeated block is the *same* pair of modules (``hidden_layer_2`` and
    ``batch_norm_2``) applied ``shared_repeats`` times, so its weights and
    batch-norm statistics are shared across all of those passes.

    No softmax is applied in ``forward``; the notebook feeds the output
    straight into ``nn.CrossEntropyLoss``.

    Parameters
    ----------
    num_features : int
        Width of the input vector.
    num_labels : int
        Number of output classes.
    hidden_size : int, optional
        Width of every hidden layer (default 2056, the original value).
    shared_repeats : int, optional
        How many times the shared hidden block is applied (default 6, the
        original value).

    Raises
    ------
    ValueError
        If ``shared_repeats`` is negative.
    """

    def __init__(self, num_features, num_labels, hidden_size=2056, shared_repeats=6):
        super().__init__()

        if shared_repeats < 0:
            raise ValueError("shared_repeats must be >= 0")

        self.num_features = num_features
        self.num_labels = num_labels
        self.hidden_size = hidden_size
        self.shared_repeats = shared_repeats

        # Attribute names and creation order are kept as they were so that
        # state_dict keys and seeded weight initialisation stay unchanged.
        self.hidden_layer_1 = nn.Linear(self.num_features, hidden_size)
        self.hidden_layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, self.num_labels)

        self.relu = nn.ReLU()
        self.batch_norm_1 = nn.BatchNorm1d(hidden_size)
        self.batch_norm_2 = nn.BatchNorm1d(hidden_size)

    def forward(self, X):
        """Map a batch ``X`` to one row of ``num_labels`` scores per sample."""
        out = self.relu(self.batch_norm_1(self.hidden_layer_1(X)))

        # Same modules on every pass: parameters are shared between passes.
        for _ in range(self.shared_repeats):
            out = self.relu(self.batch_norm_2(self.hidden_layer_2(out)))

        return self.output(out)
        
        

In [None]:
class FormInputs(torch.utils.data.Dataset):
    """Map-style dataset that serves one DataFrame row per sample.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per sample, one numeric column per model input.
    label : pandas.Series, sequence or None
        Target for each row, in the same order as ``features``. Ignored when
        ``datatype == 'test'``.
    datatype : str, optional
        ``'test'`` makes every item carry a placeholder target of 1; any other
        value (default ``'train'``) reads the target from ``label``.
    """

    def __init__(self, features, label, datatype='train'):
        self.datatype = datatype
        self.features = features
        self.label = label

    def __len__(self):
        """Number of samples (rows of ``features``)."""
        return self.features.shape[0]

    def __getitem__(self, index):
        """Return ``{'features': float tensor, 'target': long tensor}`` for row ``index``.

        ``index`` is a *position* (0 .. len-1), which is what DataLoader
        samplers produce. Positional access keeps this correct even when the
        frame's index labels are not 0..n-1.
        """
        # Explicit NumPy conversion avoids handing a pandas Series to torch.tensor.
        row = self.features.iloc[index].to_numpy(dtype='float32')

        if self.datatype == 'test':
            # No labels for the test set: emit a placeholder so batches keep
            # the same structure as in training.
            Y = 1
        else:
            labels = self.label
            # pandas objects are read by position; plain sequences by [] as before.
            Y = labels.iloc[index] if hasattr(labels, 'iloc') else labels[index]

        return {'features': torch.tensor(row, dtype=torch.float),
                'target': torch.tensor(Y, dtype=torch.long)}

In [None]:
# Labelled splits use the default 'train' mode, so real targets are returned.
train_local_input = FormInputs(train_local_X, train_local_Y)
test_local_input = FormInputs(test_local_X, test_local_Y)
train_prod_input = FormInputs(train_prod[indep], train_prod[dep])

# The production test frame has no labels; 'test' mode supplies a placeholder target.
test_prod_input = FormInputs(test_prod[indep], None, 'test')

# Fetch one sample to check what the dataset yields.
test_prod_input[100]

In [None]:
# Pushing the data to data loader
def _build_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the batch size / workers from ``config``."""
    return torch.utils.data.DataLoader(dataset,
                                       shuffle=shuffle,
                                       batch_size=config['batch_size'],
                                       num_workers=config['num_worker'])


# Training loaders are shuffled every epoch.
train_local_data_loader = _build_loader(train_local_input, shuffle=True)
train_prod_data_loader = _build_loader(train_prod_input, shuffle=True)

# Evaluation / inference loaders keep dataset order. This matters for the
# production test loader: test_fn returns a bare list of predictions, so it is
# only row-aligned with test_prod if the loader does not shuffle.
test_local_data_loader = _build_loader(test_local_input, shuffle=False)
test_prod_data_loader = _build_loader(test_prod_input, shuffle=False)

In [None]:
# Shared loss object: train_fn and eval_fn call it on the model's raw outputs
# together with the integer class targets.
loss_func = nn.CrossEntropyLoss()

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
def setting_seed(seed_no):
    """Seed Python's, NumPy's and PyTorch's (CPU and all-GPU) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_no)

In [None]:
def train_fn(model, optimizer, data_loader):
    """Run one training pass over ``data_loader``.

    Uses the notebook-level ``device`` and ``loss_func``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; switched to training mode here.
    optimizer : torch.optim.Optimizer
        Optimiser bound to ``model``'s parameters.
    data_loader : iterable
        Yields dicts with a ``'features'`` and a ``'target'`` tensor.

    Returns
    -------
    tuple
        ``(model, final_loss)`` where ``final_loss`` is the mean of the
        per-batch losses as a detached scalar tensor.
    """
    model.train()

    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        feature = data['features'].to(device)
        target = data['target'].to(device)

        prediction = model(feature)

        # loss_func already reduces the batch to a scalar.
        loss = loss_func(prediction, target)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Accumulate a detached copy: adding ``loss`` itself would keep every
        # batch's autograd graph alive until the end of the epoch.
        final_loss += loss.detach()

    final_loss = final_loss / len(data_loader)

    return model, final_loss
    

In [None]:
def eval_fn(model, data_loader):
    """Score ``model`` on a labelled loader without updating it.

    Uses the notebook-level ``device`` and ``loss_func``. Prints the predicted
    and actual class counts and the confusion matrix.

    Returns
    -------
    tuple
        ``(final_loss, actual_output, predicted_output, accuracy)``: the mean
        of the per-batch losses, the true labels and predicted labels as flat
        Python lists, and the accuracy over all samples.
    """
    model.eval()

    to_probabilities = nn.Softmax(dim=1)
    loss_total = 0
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            inputs = batch['features'].to(device)
            labels = batch['target'].to(device)

            logits = model(inputs)
            loss_total += loss_func(logits, labels)

            # Most probable class per row.
            predicted_classes = to_probabilities(logits).argmax(dim=1)

            y_pred.extend(predicted_classes.cpu().tolist())
            y_true.extend(labels.cpu().tolist())

        print("Predicted output:", np.unique(y_pred, return_counts=True))
        print("Actual output:", np.unique(y_true, return_counts=True))
        print(confusion_matrix(y_true=y_true, y_pred=y_pred))

        mean_loss = loss_total / len(data_loader)
        accuracy = accuracy_score(y_pred, y_true)

    return mean_loss, y_true, y_pred, accuracy
    

In [None]:
def test_fn(model, data_loader):
    """Predict a class for every sample in ``data_loader``.

    Uses the notebook-level ``device``. Only the ``'features'`` entry of each
    batch is read.

    Returns
    -------
    list
        Predicted class indices, in the order the loader yielded the samples.
    """
    model.eval()

    to_probabilities = nn.Softmax(dim=1)
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            logits = model(batch['features'].to(device))
            predicted_classes = to_probabilities(logits).argmax(dim=1)
            collected.extend(predicted_classes.cpu().tolist())

    return collected
    

In [None]:
def train_engine(epochs, train_data, eval_data, patience,
                 model_path='../saved_model/best_model_1.bin',
                 learning_rate=0.0003, seed_no=100, num_labels=4):
    """Train a fresh ``MulticlassClassification`` with early stopping.

    After every epoch the model is scored on ``eval_data``. Whenever the
    accuracy beats the best seen so far, the whole (DataParallel-wrapped) model
    object is written to ``model_path`` with ``torch.save``. Training stops
    once ``patience`` consecutive epochs bring no improvement.

    Parameters
    ----------
    epochs : int
        Maximum number of epochs.
    train_data, eval_data : iterable
        Loaders for training and evaluation batches.
    patience : int
        Non-improving epochs tolerated before stopping.
    model_path : str, optional
        Where the best model is saved (default: the original hard-coded path).
    learning_rate : float, optional
        Adam learning rate (default 0.0003).
    seed_no : int, optional
        Seed passed to ``setting_seed`` before the model is built (default 100).
    num_labels : int, optional
        Number of output classes (default 4).

    Returns
    -------
    tuple
        ``(model, eval_actual, eval_prediction)``. ``model`` is the state after
        the *last* epoch run, which is not necessarily the best one (that one
        is only on disk). The two lists come from the last evaluation and are
        empty when ``epochs`` is 0.
    """
    setting_seed(seed_no=seed_no)
    model = MulticlassClassification(num_features=len(indep), num_labels=num_labels)

    model = nn.DataParallel(model)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    counter = 0
    best_accuracy = 0
    # Defined up front so the return statement is valid even with epochs == 0.
    eval_actual, eval_prediction = [], []

    for epoch in range(epochs):
        model, train_loss = train_fn(model, optimizer, data_loader=train_data)
        eval_loss, eval_actual, eval_prediction, accuracy = eval_fn(model, data_loader=eval_data)

        print("Epoch: {} train loss: {} eval loss: {} eval accuracy {}".format(epoch, train_loss, eval_loss, accuracy))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            counter = 0

            print("Saving the model:", model_path)
            torch.save(model, model_path)

        else:
            counter += 1
            print("Patience:", counter)

            # ">=" rather than "==" so a patience of 0 (or less) still stops.
            if counter >= patience:
                print("Reached the patience threshold so ending the training")
                break

        print("Best Accuracy:", best_accuracy)

    return model, eval_actual, eval_prediction


In [None]:
# Train on the local split: at most 500 epochs, stopping after 10 epochs
# without an accuracy improvement on the hold-out loader.
model, eval_actual, eval_prediction = train_engine(
    epochs=500,
    train_data=train_local_data_loader,
    eval_data=test_local_data_loader,
    patience=10,
)

In [None]:
# NOTE(review): nothing in the visible notebook writes this file (train_engine
# saves a whole model object to a different path) -- confirm where it comes from.
state_save_path = '../saved_model/saved_state'

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# torch.load unpickles the file, so only load checkpoints you trust.
loaded_state = torch.load(state_save_path, map_location=device)

# Rebuild the same architecture (wrapped in DataParallel, as it is in
# train_engine) before restoring the weights.
model = MulticlassClassification(num_features=len(indep), num_labels=4)
model = nn.DataParallel(model)

# load_state_dict's return value reports missing / unexpected keys.
model_weight = model.load_state_dict(loaded_state['model_state_dict'])
model_weight