In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import random 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from tqdm import tqdm

from network import RNN_for_Text, NN_for_num_Features, MulticlassClassification

In [2]:
# Settings shared by every DataLoader built in this notebook.
config = dict(
    batch_size=5096,  # rows per batch
    num_worker=8,     # DataLoader worker processes
)

In [3]:
# Load the pre-built train / test feature frames from pickle.
# NOTE(review): only unpickle files from a trusted source - pickle can execute arbitrary code.
train_prod, test_prod = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_prod_v14.pickle", "../data/test_prod_v14.pickle")
)

print(train_prod.shape, test_prod.shape)

(903605, 785) (387975, 784)


In [None]:
print("Find the age difference")

# (new column, numerator column, denominator column), listed in creation order.
ratio_feature_specs = [
    ('to_swipe_by_session_percentage', 'to_total_swipe_counts', 'to_total_session_count'),
    # NOTE(review): the "from" ratio divides by the *to* session count. This looks like a
    # copy-paste slip (perhaps a from-side session count was intended) - confirm before changing.
    ('from_swipe_by_session_percentage', 'from_total_swipe_counts', 'to_total_session_count'),
    ('to_common_users_left_swipe_percentage', 'common_users_swiped_left', 'to_swipe_left_count'),
    ('from_common_users_left_swipe_percentage', 'common_users_swiped_left', 'from_swipe_left_count'),
    ('to_common_users_right_swipe_percentage', 'common_users_swiped_right', 'to_swipe_right_count'),
    ('from_common_users_right_swipe_percentage', 'common_users_swiped_right', 'from_swipe_right_count'),
    ('to_overall_common_users_left_swipe_percentage', 'common_users_swiped_left', 'to_total_swipe_counts'),
    ('from_overall_common_users_left_swipe_percentage', 'common_users_swiped_left', 'from_total_swipe_counts'),
    ('to_overall_common_users_right_swipe_percentage', 'common_users_swiped_right', 'to_total_swipe_counts'),
    ('from_overall_common_users_right_swipe_percentage', 'common_users_swiped_right', 'from_total_swipe_counts'),
]

# The same derived columns are added to both frames.
for frame in (train_prod, test_prod):
    frame['age_difference'] = frame['from_age'] - frame['to_age']
    for new_column, numerator, denominator in ratio_feature_specs:
        # x / 0 yields +inf, which is mapped to 0 here. 0 / 0 yields NaN and is left as is.
        frame[new_column] = (frame[numerator] / frame[denominator]).replace(np.inf, 0)

In [7]:
# Map the 999999 placeholder and any remaining NaN to 0 in both frames.
# NOTE(review): 999999 is presumably a "missing value" sentinel - confirm with the data owner.
train_prod, test_prod = (
    frame.replace(999999, 0).fillna(0)
    for frame in (train_prod, test_prod)
)

print(train_prod.shape, test_prod.shape)

(903605, 796) (387975, 795)


In [8]:
# NOTE(review): not referenced anywhere in the visible part of the notebook.
lgb_bottom_importance = ['from_purpose_id_12', 'to_unique_degree_count',
                         'from_purpose_id_3', 'from_unique_school_count',
                         'rev_strength_4', 'to_unique_school_count',
                         'rev_strength_7', 'rev_strength_8',
                         'rev_strength_6', 'rev_strength_5']


def _train_columns_matching(pattern):
    """Return the train_prod column names that contain `pattern`, in frame order."""
    all_columns = train_prod.columns
    return all_columns[all_columns.str.contains(pattern)].tolist()


to_self_intro_columns = _train_columns_matching("to_self_intro_")
from_self_intro_columns = _train_columns_matching("from_self_intro_")
review_columns = _train_columns_matching("rev_strength")

# Column roles: row identifier, target, and columns left out of the model inputs.
id_feature = ['from-to']
dep = ['score']
drop = ['user_purpose_cosine_similarity', 'to_last_swipe_year', *review_columns]

# Every column that is not a text embedding, id, target or dropped column is a numerical feature.
text_embed_columns = _train_columns_matching("_self_intro_")
excluded_columns = text_embed_columns + id_feature + dep + drop
num_feature_columns = train_prod.columns.difference(excluded_columns).tolist()
indep = id_feature + text_embed_columns + num_feature_columns

print(f"Numerical features: {len(num_feature_columns)}")
print(f"Text features: {len(text_embed_columns)}")
print(f"Total indep features {len(indep)}")

Numerical features: 184
Text features: 600
Total indep features 785


In [None]:
# Fit the scaler on the training rows only, then apply the same scaling to both frames.
scaler = MinMaxScaler().fit(train_prod[num_feature_columns])

for frame in (train_prod, test_prod):
    frame[num_feature_columns] = scaler.transform(frame[num_feature_columns])

train_prod.describe()

In [11]:
class FormInputs():
    """Map-style dataset that yields one row of the feature frame per index.

    Rows are addressed by position (0 .. len-1), which is what a DataLoader
    passes in, so the frame does not need a default 0..n-1 index.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the id, numerical and text-embedding columns.
    label : pandas.DataFrame or None
        Target column(s), row-aligned with `features`. Not read when
        `datatype == 'test'`.
    id_feature : list of str
        Name(s) of the id column(s).
    num_features : list of str
        Names of the numerical feature columns.
    intro_features : list of str
        Names of the text-embedding columns.
    datatype : str
        'test' makes every item carry a placeholder target of 1; any other
        value reads the target from `label`.
    """

    def __init__(self, features, label, id_feature, num_features, intro_features, datatype='train'):
        self.features = features
        self.label = label
        self.id_feature=id_feature
        self.num_columns = num_features
        self.intro_columns = intro_features
        self.datatype=datatype

    def __len__(self):
        """Number of rows in the feature frame."""
        return self.features.shape[0]

    def __getitem__(self, index):
        """Return the id values, float feature tensors and long target for row `index`."""
        # Positional lookup: `.loc` would treat `index` as a label and fail, or
        # silently pick another row, when the frame's index is not 0..n-1.
        row = self.features.iloc[index]
        id_feature = row[self.id_feature]
        # A row spanning mixed-dtype columns comes back as object dtype, so cast explicitly.
        num_features = row[self.num_columns].to_numpy(dtype=np.float32)
        intro_features = row[self.intro_columns].to_numpy(dtype=np.float32)

        if self.datatype=='test':
            # No labels at inference time; 1 is only a placeholder.
            Y = 1
        else:
            Y = np.asarray(self.label.iloc[index])

        return {'id_feature':id_feature.values.tolist(),
                'num_features': torch.tensor(num_features, dtype=torch.float),
                'intro_features': torch.tensor(intro_features, dtype=torch.float),
                'target' : torch.tensor(Y, dtype=torch.long)}

In [None]:
# Column groups are identical for the train and test datasets.
column_groups = dict(id_feature=id_feature,
                     num_features=num_feature_columns,
                     intro_features=text_embed_columns)

train_prod_input = FormInputs(features=train_prod[indep], label=train_prod[dep],
                              datatype='train', **column_groups)

# The test frame has no target, so no label frame is passed.
test_prod_input = FormInputs(features=test_prod[indep], label=None,
                             datatype='test', **column_groups)

train_prod_input[0]

In [13]:
# Both stage-one loaders use the same shuffling and batching settings.
# Shuffling changes the row order; get_features carries the ids alongside the outputs.
prod_loader_settings = dict(shuffle=True,
                            batch_size=config['batch_size'],
                            num_workers=config['num_worker'])

train_prod_data_loader = torch.utils.data.DataLoader(train_prod_input, **prod_loader_settings)
test_prod_data_loader = torch.utils.data.DataLoader(test_prod_input, **prod_loader_settings)

In [14]:
# Loss used by train_fn and eval_fn; it is applied to the model output before any softmax.
loss_func = nn.CrossEntropyLoss()

In [15]:
# Use the GPU when torch reports one, otherwise the CPU. Models and batches are moved here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [16]:
def setting_seed(seed_no):
    """Seed Python's `random`, NumPy and torch (CPU and all CUDA devices) with `seed_no`."""
    seeders = (random.seed,
               np.random.seed,
               torch.manual_seed,
               torch.cuda.manual_seed_all)
    for seed_with in seeders:
        seed_with(seed_no)

In [17]:
text_hidden_dim=600
text_output_dim=100

num_hidden_dim=184
num_output_dim=50

text_features = len(text_embed_columns)
numeric_features = len(num_feature_columns)

def get_features(data_loader, text_model=None, num_model=None):
    '''
    Pass every batch of `data_loader` through the text and numerical networks.

    The text-embedding columns go through the RNN network and the numerical
    columns go through the dense network. The two outputs are returned
    separately; the caller concatenates them.

    Parameters
    ----------
    data_loader : DataLoader over a FormInputs dataset
        Batches must carry 'id_feature', 'num_features', 'intro_features' and 'target'.
    text_model, num_model : optional
        Networks to use. Pass the SAME instances for the train and test
        loaders so both sets are embedded with identical weights. When
        omitted, freshly initialised networks are created (legacy behaviour).

    Returns
    -------
    (id_col, target_col, text_embed_output, num_embed_output)
        Two 1-D numpy arrays followed by two CPU tensors, all in the order in
        which the loader produced the rows.
    '''
    if text_model is None:
        text_model = RNN_for_Text(embedding_dim=text_features, hidden_dim=text_hidden_dim, output_dim=text_output_dim)
    if num_model is None:
        num_model = NN_for_num_Features(num_features=numeric_features, hidden_dim=num_hidden_dim, output_dim=num_output_dim)
    # NOTE(review): the networks stay in their default (training) mode, as before.
    # If they contain dropout or batch-norm layers, `.eval()` is probably wanted - confirm.
    text_model = text_model.to(device)
    num_model = num_model.to(device)

    # Collect per-batch chunks and join once at the end, instead of
    # re-allocating the full result on every batch.
    id_chunks, target_chunks, text_chunks, num_chunks = [], [], [], []

    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):

            num_features = data['num_features'].to(device)
            intro_features = data['intro_features'].to(device)

            # Passing to the RNN model
            temp_text_embed = text_model(intro_features).view(-1, text_output_dim)
            # Move results to the CPU so the accumulators work on GPU runs too.
            text_chunks.append(temp_text_embed.cpu())

            # Passing into a linear Dense NN
            num_chunks.append(num_model(num_features).cpu())

            id_chunks.append(np.asarray(data['id_feature']).ravel())
            target_chunks.append(data['target'].cpu().numpy().ravel())

    if not text_chunks:
        # Empty loader: same empty containers the original accumulators started from.
        return np.array([]), np.array([]), torch.tensor([]), torch.tensor([])

    id_col = np.concatenate(id_chunks)
    # float64 matches what appending to an empty float array used to produce.
    target_col = np.concatenate(target_chunks).astype(np.float64)

    return id_col, target_col, torch.cat(text_chunks), torch.cat(num_chunks)

# Build ONE pair of seeded networks and reuse it for both sets. Creating new
# random networks per call would embed train and test rows with different weights.
setting_seed(seed_no=100)
shared_text_model = RNN_for_Text(embedding_dim=text_features, hidden_dim=text_hidden_dim, output_dim=text_output_dim)
shared_num_model = NN_for_num_Features(num_features=numeric_features, hidden_dim=num_hidden_dim, output_dim=num_output_dim)

train_id_col, train_target_col, train_prod_text_embed_output, train_prod_num_embed_output = get_features(
    data_loader=train_prod_data_loader, text_model=shared_text_model, num_model=shared_num_model)
test_id_col, test_target_col, test_prod_text_embed_output, test_prod_num_embed_output = get_features(
    data_loader=test_prod_data_loader, text_model=shared_text_model, num_model=shared_num_model)

100%|██████████| 178/178 [08:07<00:00,  2.74s/it]
100%|██████████| 77/77 [03:22<00:00,  2.63s/it]


In [18]:
# Join text and numerical embeddings column-wise; rows stay aligned.
train_prod_text_num_concat = torch.cat((train_prod_text_embed_output, train_prod_num_embed_output), dim=1)
test_prod_text_num_concat = torch.cat((test_prod_text_embed_output, test_prod_num_embed_output), dim=1)

# Width of the combined embedding.
final_features = train_prod_text_num_concat.size(1)

# Shape summary for the train outputs ...
print(f"Train ID features:{train_id_col.shape}")
print(f"Train target features:{train_target_col.shape}")
print(f"Train Text features:{train_prod_text_embed_output.shape}")
print(f"Train Num features:{train_prod_num_embed_output.shape}")
print(f"Train text and num features:{train_prod_text_num_concat.shape}")
print("")
# ... and for the test outputs.
print(f"Test ID features:{test_id_col.shape}")
print(f"Test target features:{test_target_col.shape}")
print(f"Test Text features:{test_prod_text_embed_output.shape}")
print(f"Test Num features:{test_prod_num_embed_output.shape}")
print(f"Test text and num features:{test_prod_text_num_concat.shape}")



Train ID features:(903605,)
Train target features:(903605,)
Train Text features:torch.Size([903605, 100])
Train Num features:torch.Size([903605, 50])
Train text and num features:torch.Size([903605, 150])

Test ID features:(387975,)
Test target features:(387975,)
Test Text features:torch.Size([387975, 100])
Test Num features:torch.Size([387975, 50])
Test text and num features:torch.Size([387975, 150])


In [None]:
# Stack id, target (train only) and both embedding blocks side by side, one row per sample.
# NOTE(review): the id column is presumably a string, in which case numpy promotes the whole
# array to a string dtype. That is memory hungry, and the numbers are cast back to float in
# the next cell. Confirm and consider building the DataFrame per column instead.
train_set = np.concatenate((train_id_col.reshape(-1, 1), train_target_col.reshape(-1, 1), train_prod_text_embed_output, train_prod_num_embed_output), axis=1)
test_set = np.concatenate((test_id_col.reshape(-1, 1), test_prod_text_embed_output, test_prod_num_embed_output), axis=1)

In [None]:
# One column name per embedding dimension.
text_col = ["text_embed_"+str(position) for position in range(text_output_dim)]
num_col = ["num_embed_"+str(position) for position in range(num_output_dim)]
embedding_cols = text_col + num_col

# Wrap the stacked arrays in labelled frames. Only the train frame has a 'score' column.
train_set = pd.DataFrame(train_set, columns=['from-to', 'score'] + embedding_cols)
test_set = pd.DataFrame(test_set, columns=['from-to'] + embedding_cols)

# Everything except the id column is numeric.
train_set = train_set.astype({column: 'float' for column in ['score'] + embedding_cols})
test_set = test_set.astype({column: 'float' for column in embedding_cols})

train_set

In [26]:
# From here on `dep` is a plain column name (it was a list earlier) and `indep`
# holds the embedding columns of the stage-two frame.
dep, id_feature = 'score', ['from-to']
indep = train_set.columns.difference([*id_feature, dep])
indep

Index(['num_embed_0', 'num_embed_1', 'num_embed_10', 'num_embed_11',
       'num_embed_12', 'num_embed_13', 'num_embed_14', 'num_embed_15',
       'num_embed_16', 'num_embed_17', 'num_embed_18', 'num_embed_19',
       'num_embed_2', 'num_embed_20', 'num_embed_21', 'num_embed_22',
       'num_embed_23', 'num_embed_24', 'num_embed_25', 'num_embed_26',
       'num_embed_27', 'num_embed_28', 'num_embed_29', 'num_embed_3',
       'num_embed_30', 'num_embed_31', 'num_embed_32', 'num_embed_33',
       'num_embed_34', 'num_embed_35', 'num_embed_36', 'num_embed_37',
       'num_embed_38', 'num_embed_39', 'num_embed_4', 'num_embed_40',
       'num_embed_41', 'num_embed_42', 'num_embed_43', 'num_embed_44',
       'num_embed_45', 'num_embed_46', 'num_embed_47', 'num_embed_48',
       'num_embed_49', 'num_embed_5', 'num_embed_6', 'num_embed_7',
       'num_embed_8', 'num_embed_9', 'text_embed_0', 'text_embed_1',
       'text_embed_10', 'text_embed_11', 'text_embed_12', 'text_embed_13',
       'text

# The outputs of the steps above are passed as input to the final neural-network layers in the next step

In [27]:
# Seed NumPy's global RNG, which train_test_split uses when no random_state is passed.
np.random.seed(100)

# Stratified 80/20 split of the labelled rows for local validation.
local_split = train_test_split(train_set[indep],
                               train_set[dep],
                               test_size=0.2,
                               stratify=train_set[dep])

# Give each part a fresh 0..n-1 index so rows can be fetched by integer position.
train_local_X, test_local_X, train_local_Y, test_local_Y = (
    part.reset_index(drop=True) for part in local_split
)

print(train_local_X.shape, train_local_Y.shape, test_local_X.shape, test_local_Y.shape)

(722884, 70) (722884,) (180721, 70) (180721,)


In [29]:
class FormInputs_v2():
    """Map-style dataset over the stage-two (embedding) features.

    Rows are addressed by position (0 .. len-1), which is what a DataLoader
    passes in, so neither `features` nor `label` needs a default index.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature columns only; every column of a row becomes model input.
    label : pandas.Series or None
        Targets, row-aligned with `features`. Not read when `datatype == 'test'`.
    id_feature : list of str
        Stored on the instance but not used when building items.
    datatype : str
        'test' makes every item carry a placeholder target of 1; any other
        value reads the target from `label`.
    """

    def __init__(self, features, label, id_feature, datatype='train'):
        self.features = features
        self.label = label
        self.id_feature=id_feature
        self.datatype=datatype

    def __len__(self):
        """Number of rows in the feature frame."""
        return self.features.shape[0]

    def __getitem__(self, index):
        """Return the float feature tensor and long target for row `index`."""
        # Positional lookup: `.loc[index]` / `label[index]` are label-based and
        # break, or fetch another row, when the index is not 0..n-1.
        X = self.features.iloc[index].to_numpy(dtype=np.float32)

        if self.datatype=='test':
            # No labels at inference time; 1 is only a placeholder.
            Y = 1
        else:
            Y = np.asarray(self.label.iloc[index])

        return {'features':torch.tensor(X, dtype=torch.float),
                'target' : torch.tensor(Y, dtype=torch.long)}

In [31]:
# Labelled datasets: the two halves of the local split, plus the full training set.
labelled = dict(id_feature=id_feature, datatype='train')

train_set_local_input = FormInputs_v2(features=train_local_X, label=train_local_Y, **labelled)
test_set_local_input = FormInputs_v2(features=test_local_X, label=test_local_Y, **labelled)
train_set_prod_input = FormInputs_v2(features=train_set[indep], label=train_set[dep], **labelled)

# Unlabelled dataset used for the final predictions.
test_set_prod_input = FormInputs_v2(features=test_set[indep], label=None,
                                    id_feature=id_feature,
                                    datatype='test')

train_set_prod_input[0]

{'features': tensor([-0.1316, -0.2858, -0.0910, -0.1259, -0.1682, -0.2966, -0.1103,  0.0168,
         -0.1502,  0.1775, -0.3764, -0.0737, -0.0307, -0.4879,  0.0754, -0.4804,
         -0.3590,  0.1910,  0.0871,  0.2194,  0.0723, -0.1846,  0.2590,  0.3882,
         -0.0378,  0.0484, -0.2049, -0.0090, -0.0235, -0.1203, -0.0377,  0.0665,
         -0.1807,  0.1256, -0.1030,  0.0661, -0.4802, -0.0856,  0.3613,  0.1793,
          0.1104,  0.2613, -0.1044, -0.1081, -0.0211, -0.5167,  0.0357,  0.0074,
         -0.1306, -0.3530, -0.2787,  0.0880,  0.0470, -0.3388,  1.0979, -0.3980,
          0.5047, -0.2246, -0.6184,  0.1844,  0.3295, -0.5399,  0.9563,  0.6263,
         -0.2246, -0.9370, -0.0734, -0.3513,  0.7549, -1.3140]),
 'target': tensor(1)}

In [32]:
# Batching settings shared by all stage-two loaders.
stage_two_loader_settings = dict(batch_size=config['batch_size'],
                                 num_workers=config['num_worker'])

# Training loaders are shuffled each epoch.
train_set_local_data_loader = torch.utils.data.DataLoader(train_set_local_input,
                                                          shuffle=True,
                                                          **stage_two_loader_settings)
train_set_prod_data_loader = torch.utils.data.DataLoader(train_set_prod_input,
                                                         shuffle=True,
                                                         **stage_two_loader_settings)

# Evaluation / prediction loaders keep the dataset order. test_fn returns
# predictions without ids, so a shuffled loader would make it impossible to
# match them back to the rows of test_set.
test_set_local_data_loader = torch.utils.data.DataLoader(test_set_local_input,
                                                         shuffle=False,
                                                         **stage_two_loader_settings)
test_set_prod_data_loader = torch.utils.data.DataLoader(test_set_prod_input,
                                                        shuffle=False,
                                                        **stage_two_loader_settings)

In [33]:
def train_fn(model, optimizer, data_loader):
    """Run one training epoch and return the model with its mean batch loss.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; switched to training mode here.
    optimizer : torch.optim.Optimizer
        Optimizer that owns the model's parameters.
    data_loader : iterable of dict
        Batches carrying 'features' and 'target' tensors.

    Returns
    -------
    (model, final_loss)
        The same model object and the loss averaged over the batches
        (0.0 when the loader yields no batches).
    """
    model.train()

    running_loss = 0.0
    for data in tqdm(data_loader, total=len(data_loader)):
        feature = data['features'].to(device)
        target = data['target'].to(device)

        # Clear gradients before the forward pass so nothing stale leaks into this step.
        optimizer.zero_grad()

        prediction = model(feature)
        loss = loss_func(prediction, target)
        loss.backward()
        optimizer.step()

        # Detach so the running total does not keep each batch's autograd graph alive.
        running_loss += loss.detach()

    # Guard against an empty loader instead of dividing by zero.
    final_loss = running_loss / max(len(data_loader), 1)

    return model, final_loss
    

In [34]:
def eval_fn(model, data_loader):
    """Score `model` on a labelled loader.

    Prints the predicted / actual class counts and the confusion matrix, then
    returns (mean batch loss, actual labels, predicted labels, accuracy).
    """
    model.eval()

    loss_total = 0
    actual_output, predicted_output = [], []
    to_probabilities = nn.Softmax(dim=1)

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            target = batch['target'].to(device)
            logits = model(batch['features'].to(device))

            loss_total += loss_func(logits, target)

            # The predicted class is the one with the highest softmax probability.
            predicted_labels = to_probabilities(logits).argmax(axis=1)

            predicted_output += predicted_labels.detach().cpu().numpy().tolist()
            actual_output += target.detach().cpu().numpy().tolist()

        print("Predicted output:", np.unique(predicted_output, return_counts=True))
        print("Actual output:", np.unique(actual_output, return_counts=True))
        print(confusion_matrix(y_true=actual_output, y_pred=predicted_output))

        final_loss = loss_total/len(data_loader)
        accuracy = accuracy_score(predicted_output, actual_output)

    return final_loss, actual_output, predicted_output, accuracy
    

In [35]:
def test_fn(model, data_loader):
    """Return the predicted class of every row in `data_loader`, in loader order."""
    model.eval()

    to_probabilities = nn.Softmax(dim=1)
    predicted_output = []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            logits = model(batch['features'].to(device))

            # The predicted class is the one with the highest softmax probability.
            predicted_labels = to_probabilities(logits).argmax(axis=1)
            predicted_output += predicted_labels.detach().cpu().numpy().tolist()

    return predicted_output
    

In [36]:
def train_engine(epochs, train_data, eval_data, patience,
                 model_path='../saved_model/best_model_1.bin',
                 num_labels=4, lr=0.0003, seed_no=100):
    """Train the classifier with early stopping on evaluation accuracy.

    Parameters
    ----------
    epochs : int
        Maximum number of epochs.
    train_data, eval_data : DataLoader
        Loaders used for training and for evaluation after every epoch.
    patience : int
        Stop after this many consecutive epochs without a new best accuracy.
    model_path : str
        Where the best model so far is written with `torch.save`.
    num_labels : int
        Number of classes passed to the classifier.
    lr : float
        Adam learning rate.
    seed_no : int
        Seed applied before the model is created.

    Returns
    -------
    (model, eval_actual, eval_prediction)
        The model as it stands after the last epoch run (the best-scoring
        version is the one saved at `model_path`), plus the labels and
        predictions of that last evaluation. Both lists are empty when no
        epoch ran.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    setting_seed(seed_no=seed_no)
    model = MulticlassClassification(num_features=final_features, num_labels=num_labels)

    model = nn.DataParallel(model)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Create the target directory up front so the first checkpoint cannot fail on it.
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    counter = 0
    best_accuracy = 0
    # Defined before the loop so `epochs=0` returns cleanly instead of raising.
    eval_actual, eval_prediction = [], []
    for epoch in range(epochs):
        model, train_loss = train_fn(model, optimizer, data_loader=train_data)
        eval_loss, eval_actual, eval_prediction, accuracy = eval_fn(model, data_loader=eval_data)

        print("Epoch: {} train loss: {} eval loss: {} eval accuracy {}".format(epoch, train_loss, eval_loss, accuracy))

        if accuracy > best_accuracy:

            best_accuracy = accuracy
            counter = 0

            print("Saving the model:", model_path)
            # NOTE(review): this pickles the whole DataParallel wrapper. Saving
            # `model.state_dict()` is more portable, but it changes the file
            # format, so check what the loading code expects first.
            torch.save(model, model_path)

        else:
            counter += 1
            print("Patience:", counter)

            # `>=` also stops when patience is 0 or negative, which `==` never matches.
            if counter >= patience:
                print("Reached the patience threshold so ending the training")
                break

        print("Best Accuracy:", best_accuracy)

    return model, eval_actual, eval_prediction


In [None]:
# Train on the local training split and evaluate on the held-out 20% after every epoch.
# Training stops after 10 consecutive epochs without a new best accuracy, or after 500 epochs.
model, eval_actual, eval_prediction = train_engine(epochs = 500, 
                                                   train_data=train_set_local_data_loader, 
                                                   eval_data=test_set_local_data_loader,
                                                   patience=10)

In [101]:
# NOTE(review): nothing in the visible notebook writes this file (train_engine saves a
# whole model to a different path), so this cell depends on an artefact made elsewhere.
state_save_path = '../saved_model/saved_state'
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
loaded_state = torch.load(state_save_path, map_location=device)

# Rebuild the same wrapper structure that was trained, so the state-dict keys match.
model = MulticlassClassification(num_features=len(indep), num_labels=4)
model = nn.DataParallel(model)

model_weight = model.load_state_dict(loaded_state['model_state_dict'])
# Put the restored model on the device that eval_fn / test_fn move their batches to.
model = model.to(device)
model_weight

<All keys matched successfully>