In [1]:
import math
import os
import pickle as pk
import random

import networkx as nx
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from tqdm import tqdm

In [2]:
#Load the training edge list into a directed graph; the cells below all read `train_graph`.
if os.path.isfile('data/train_orig.csv'):
    train_graph=nx.read_edgelist('data/train_orig.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
    #print the graph's own one-line summary; nx.info() no longer exists in NetworkX 3.0
    print(train_graph)
else:
    print("Inorder to run this project, please follow the bello steps")
    print('Step 1: Run EDA.ipynb, atleast the first 3 cells if not full')
    print('Step 2: Run full SplitData.ipynb or search the data folder for the required files')

DiGraph with 1780938 nodes and 7550015 edges


##### Similarity measures used: Jaccard and Cosine
$$
  \text{Jaccard Index} = \frac{|X \cap Y|}{|X \cup Y|}
$$

In [3]:
#helper function to calculate the Jaccard index of the followee sets of two nodes
def followees_jaqScore(a,b):
    """Return |X & Y| / |X | Y| where X and Y are the followees (successors) of a and b.

    Returns 0 when either node has no followees or is not present in
    train_graph; otherwise a float in [0, 1].
    """
    try:
        followees_a = set(train_graph.successors(a))
        followees_b = set(train_graph.successors(b))
    except nx.NetworkXError:
        #successors() raises NetworkXError for a node that is not in the graph
        return 0
    #`or`, not `|`: the bitwise operator binds tighter than `==` and changed the meaning of the test
    if not followees_a or not followees_b:
        return 0
    return len(followees_a & followees_b) / len(followees_a | followees_b)

#example test case
print(followees_jaqScore(273084,1505602))
print(followees_jaqScore(273084,1505602))

0.0
0.0


In [4]:
#helper function to calculate the Jaccard index of the follower sets of two nodes
def followers_jaqScore(a,b):
    """Return |X & Y| / |X | Y| where X and Y are the followers (predecessors) of a and b.

    Returns 0 when either node has no followers or is not present in
    train_graph; otherwise a float in [0, 1].
    """
    try:
        followers_a = set(train_graph.predecessors(a))
        #the original read an undefined name `graph` here, so every call fell into the except branch
        followers_b = set(train_graph.predecessors(b))
    except nx.NetworkXError:
        #predecessors() raises NetworkXError for a node that is not in the graph
        return 0
    if not followers_a or not followers_b:
        return 0
    return len(followers_a & followers_b) / len(followers_a | followers_b)

#example test case (both calls exercise the followers variant defined in this cell)
print(followers_jaqScore(273084,470294))
print(followers_jaqScore(669354,1635354))

0
0.0


##### Cosine Similarity 
$$
    \text{Cosine Similarity} = \frac{|X \cap Y|}{\sqrt{|X|\cdot|Y|}}
$$

In [5]:
#helper function to calculate cosine similarity for followees

def followees_cosScore(a,b):
    """Return |X & Y| / sqrt(|X| * |Y|) where X and Y are the followees (successors) of a and b.

    Returns 0 when either node has no followees or is not present in
    train_graph; otherwise a float in [0, 1].
    """
    try:
        followees_a = set(train_graph.successors(a))
        followees_b = set(train_graph.successors(b))
    except nx.NetworkXError:
        #successors() raises NetworkXError for a node that is not in the graph
        return 0
    if not followees_a or not followees_b:
        return 0
    return len(followees_a & followees_b) / math.sqrt(len(followees_a) * len(followees_b))

#example test case
print(followees_cosScore(273084,1505602))
print(followees_cosScore(273084,1635354))

0
0


In [6]:
#preferential attachment score based on followees
def preferential_followee(a,b):
    """Return (number of followees of a) * (number of followees of b).

    Any error raised while reading the successors of either node yields 0.
    """
    try:
        count_a = len(set(train_graph.successors(a)))
        count_b = len(set(train_graph.successors(b)))
    except:
        return 0
    #a zero count on either side makes the product 0, matching the explicit guard this replaces
    return count_a * count_b
    
print(preferential_followee(273083,1505602))

20


In [7]:
def preferential_followers(a,b):
    """Return (number of followers of a) * (number of followers of b).

    Any error raised while reading the predecessors of either node yields 0.
    """
    try:
        count_a = len(set(train_graph.predecessors(a)))
        count_b = len(set(train_graph.predecessors(b)))
    except:
        return 0
    #a zero count on either side makes the product 0, matching the explicit guard this replaces
    return count_a * count_b
    
print(preferential_followers(273084,1505602))

91


In [8]:
def followers_cosScore(a,b):
    """Return |X & Y| / sqrt(|X| * |Y|) where X and Y are the followers (predecessors) of a and b.

    Returns 0 when either node has no followers or is not present in
    train_graph; otherwise a float in [0, 1].
    """
    try:
        followers_a = set(train_graph.predecessors(a))
        followers_b = set(train_graph.predecessors(b))
    except nx.NetworkXError:
        #predecessors() raises NetworkXError for a node that is not in the graph
        return 0
    if not followers_a or not followers_b:
        return 0
    #square root of the product of both sizes, the same denominator followees_cosScore uses
    return len(followers_a & followers_b) / math.sqrt(len(followers_a) * len(followers_b))

#example test case
print(followers_cosScore(2,470294))
print(followers_cosScore(669354,1635354))

0
0


##### Ranking Measures: Page Rank

In [9]:
#Calculate page rank (cached on disk because the computation is expensive)
if not os.path.isfile('data/page_rank.p'):
    pr = nx.pagerank(train_graph, alpha=0.85)
    with open('data/page_rank.p','wb') as fh:
        pk.dump(pr,fh)
else:
    #NOTE: unpickling can execute arbitrary code; only load cache files produced by this notebook
    with open('data/page_rank.p','rb') as fh:
        pr = pk.load(fh)

#for imputing to nodes which are not there in Train data
mean_pr = float(sum(pr.values())) / len(pr)

print('min',min(pr.values()))
print('max',max(pr.values()))
print('mean',mean_pr)
print(mean_pr)

min 1.657890853286788e-07
max 2.799140908965827e-05
mean 5.61501860254627e-07
5.61501860254627e-07


In [10]:
# helper function to compute shortest path:
def compute_shortestPath(a,b):
    """Return the length of the shortest directed path from a to b, ignoring a direct a->b edge.

    A direct edge a->b, if present, is removed for the duration of the search
    and always put back afterwards. Returns -1 when no path exists or when
    either node is missing from train_graph.
    """
    had_direct_edge = train_graph.has_edge(a,b)
    if had_direct_edge:
        train_graph.remove_edge(a,b)
    try:
        #nx.shortestPath does not exist; the original call always raised and left the edge removed
        return nx.shortest_path_length(train_graph,source=a,target=b)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return -1
    finally:
        #restore the graph no matter how the search ended
        if had_direct_edge:
            train_graph.add_edge(a,b)

#test example cases
compute_shortestPath(77697, 826021)
compute_shortestPath(669354,1635354)

-1

In [11]:
#getting weakly connected components from graph 
weak_connec=list(nx.weakly_connected_components(train_graph))

def belongs_toWeakConnec(a,b):
    """Return 1 when a and b count as belonging to the same weakly connected component, else 0.

    - If b already follows a (edge b->a exists) the answer is 1.
    - If a is in no component, or b is not in a's component, the answer is 0.
    - If the edge a->b exists, it is temporarily removed and the answer is 1
      only when compute_shortestPath still finds a path from a to b.
    - Otherwise sharing the component is enough and the answer is 1.
    """
    if train_graph.has_edge(b,a):
        return 1
    #component (set of nodes) that contains a; empty when a appears in no component
    component = next((nodes for nodes in weak_connec if a in nodes), ())
    if b not in component:
        return 0
    if not train_graph.has_edge(a,b):
        return 1
    train_graph.remove_edge(a,b)
    try:
        still_reachable = compute_shortestPath(a,b) != -1
    finally:
        #put the edge back even if the path search is interrupted or raises
        train_graph.add_edge(a,b)
    return 1 if still_reachable else 0

#example test cases
belongs_toWeakConnec(861, 1659750)
belongs_toWeakConnec(669354,1635354)

0

In [12]:
def adar_index(a,b):
    """Return the sum over common followees z of a and b of 1 / log10(number of followers of z).

    Returns 0 when there are no common followees or when a or b is not in
    train_graph. Common followees with a single follower are skipped because
    log10(1) == 0 would turn the score into inf.
    """
    try:
        common = set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))
    except nx.NetworkXError:
        #successors() raises NetworkXError for a node that is not in the graph
        return 0
    val = 0
    for node in common:
        n_followers = len(list(train_graph.predecessors(node)))
        if n_followers > 1:
            val += 1/np.log10(n_followers)
    return val

#example test case
adar_index(1,189226)
adar_index(669354,1635354)

0

In [13]:
def follows_back(a,b):
    """Return 1 when train_graph contains the reverse edge b->a, otherwise 0."""
    return 1 if train_graph.has_edge(b,a) else 0

follows_back(1,189226)
follows_back(669354,1635354)

0

In [14]:
def compute_features_stage1(df_final):
    """Collect follower/followee counts and overlaps for every (source, destination) row.

    Reads the 'source_node' and 'destination_node' columns of df_final and
    returns six lists, one entry per row, in this order: num_followers_s,
    num_followers_d, num_followees_s, num_followees_d, inter_followers,
    inter_followees. A node whose neighbours cannot be read contributes
    empty sets (so counts and overlaps of 0).
    """
    def _neighbour_sets(row, column):
        #(followers, followees) of the node stored in row[column]; two empty sets on any failure
        try:
            followers = set(train_graph.predecessors(row[column]))
            followees = set(train_graph.successors(row[column]))
        except:
            followers, followees = set(), set()
        return followers, followees

    num_followers_s, num_followees_s = [], []
    num_followers_d, num_followees_d = [], []
    inter_followers, inter_followees = [], []
    for _, row in df_final.iterrows():
        src_followers, src_followees = _neighbour_sets(row, 'source_node')
        dst_followers, dst_followees = _neighbour_sets(row, 'destination_node')

        num_followers_s.append(len(src_followers))
        num_followees_s.append(len(src_followees))
        num_followers_d.append(len(dst_followers))
        num_followees_d.append(len(dst_followees))

        inter_followers.append(len(src_followers & dst_followers))
        inter_followees.append(len(src_followees & dst_followees))

    return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees

In [15]:
def svd(x, S,dictionary):
    """Return the row of S stored at index dictionary[x].

    When the lookup fails for any reason (for example x is not a key of
    dictionary) a list of six zeros is returned instead.
    """
    try:
        vector = S[dictionary[x]]
    except:
        vector = [0,0,0,0,0,0]
    return vector

In [16]:
# Katz Centrality (cached on disk because the computation is expensive):
if not os.path.isfile('data/katz.p'):
    #public entry point instead of reaching through the nx.katz submodule
    katz = nx.katz_centrality(train_graph,alpha=0.005,beta=1)
    with open('data/katz.p','wb') as fh:
        pk.dump(katz,fh)
else:
    #NOTE: unpickling can execute arbitrary code; only load cache files produced by this notebook
    with open('data/katz.p','rb') as fh:
        katz = pk.load(fh)

#mean score, used later as the value for nodes that have no katz entry
mean_katz = float(sum(katz.values())) / len(katz)

print('min',min(katz.values()))
print('max',max(katz.values()))
print('mean',mean_katz)
print(mean_katz)

min 0.0007313184369994846
max 0.0033831594632843507
mean 0.0007483399997557072
0.0007483399997557072


In [17]:
# Hits Score (cached on disk); hits[0] is used below as the hub scores and hits[1] as the authority scores

if not os.path.isfile('data/hits.p'):
    hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)
    with open('data/hits.p','wb') as fh:
        pk.dump(hits,fh)
else:
    #NOTE: unpickling can execute arbitrary code; only load cache files produced by this notebook
    with open('data/hits.p','rb') as fh:
        hits = pk.load(fh)

print('min',min(hits[0].values()))
print('max',max(hits[0].values()))
print('mean',float(sum(hits[0].values())) / len(hits[0]))

min -9.967983755663483e-21
max 0.004838592143483775
mean 5.61501860252738e-07


In [18]:
#Build the lists of row numbers to skip so that only a random sample of rows is read from the CSVs.
#NOTE(review): the row counts are hard-coded; presumably they match the files on disk - confirm.
#NOTE(review): no random seed is set in the visible cells, so the sample differs from run to run.
#NOTE(review): if either file is missing, the prints at the bottom fail because the names are never assigned.
if os.path.isfile('data/train_orig.csv'):
    filename = "data/train_orig.csv"
    n_train =  15100028
    s = 100000
    #row numbers drawn from 1..n_train (row 0 is never skipped); all but s of them are skipped
    skip_train = sorted(random.sample(range(1,n_train+1),n_train-s))
    
if os.path.isfile('data/test_orig.csv'):
    filename = "data/test_orig.csv"
    n_test = 3775006
    s = 50000
    #row numbers drawn from 1..n_test (row 0 is never skipped); all but s of them are skipped
    skip_test = sorted(random.sample(range(1,n_test+1),n_test-s))

print("Number of rows in the train data file:", n_train)
print("Number of rows we are going to elimiate in train data are",len(skip_train))
print("Number of rows in the test data file:", n_test)
print("Number of rows we are going to elimiate in test data are",len(skip_test))

Number of rows in the train data file: 15100028
Number of rows we are going to elimiate in train data are 15000028
Number of rows in the test data file: 3775006
Number of rows we are going to elimiate in test data are 3725006


In [19]:
#Read the sampled train pairs and their labels; the same skip list keeps both files row-aligned
df_final_train = pd.read_csv(
    'data/train_x.csv',
    skiprows=skip_train,
    names=['source_node', 'destination_node'],
)
df_final_train['indicator_link'] = pd.read_csv(
    'data/train_y.csv',
    skiprows=skip_train,
    names=['indicator_link'],
)
print("Train matrix size ",df_final_train.shape)
df_final_train.head(5)

Train matrix size  (100002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,365363,425182,1
1,1746765,501746,1
2,117292,783587,1
3,162833,1385156,1
4,1818705,1699250,1


In [20]:
#Read the sampled test pairs and their labels; the same skip list keeps both files row-aligned
df_final_test = pd.read_csv(
    'data/test_x.csv',
    skiprows=skip_test,
    names=['source_node', 'destination_node'],
)
df_final_test['indicator_link'] = pd.read_csv(
    'data/test_y.csv',
    skiprows=skip_test,
    names=['indicator_link'],
)
print("Test matrix size ",df_final_test.shape)
df_final_test.head(5)

Test matrix size  (50002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,1696056,1165625,1
1,398614,946454,1
2,719206,276747,1
3,551027,1787160,1
4,484859,1056642,1


In [21]:
# Adding a set of features (stage 1): similarity scores plus follower/followee counts.
# The result is cached in data/storage_sample_stage1.h5 and reloaded from there when present.

if not os.path.isfile('data/storage_sample_stage1.h5'):
    #(column name, pairwise score function), in the order the columns are added
    for feature_name, score_fn in (('jaccard_followers', followers_jaqScore),
                                   ('jaccard_followees', followees_jaqScore),
                                   ('cosine_followers', followers_cosScore),
                                   ('cosine_followees', followees_cosScore)):
        for frame in (df_final_train, df_final_test):
            frame[feature_name] = frame.apply(lambda row: score_fn(row['source_node'],row['destination_node']),axis=1)

    for frame in (df_final_train, df_final_test):
        frame['num_followers_s'], frame['num_followers_d'], \
        frame['num_followees_s'], frame['num_followees_d'], \
        frame['inter_followers'], frame['inter_followees']= compute_features_stage1(frame)

    #context manager closes the store even if one of the writes fails
    with pd.HDFStore('data/storage_sample_stage1.h5') as hdf:
        hdf.put('train_df',df_final_train, format='table', data_columns=True)
        hdf.put('test_df',df_final_test, format='table', data_columns=True)
else:
    df_final_train = pd.read_hdf('data/storage_sample_stage1.h5', 'train_df',mode='r')
    df_final_test = pd.read_hdf('data/storage_sample_stage1.h5', 'test_df',mode='r')


In [22]:
# Adding new set of features (stage 2): adar index, follow-back flag, same-component flag, shortest path.
# The result is cached in data/storage_sample_stage2.h5 and reloaded from there when present.

if not os.path.isfile('data/storage_sample_stage2.h5'):
    #(column name, pairwise feature function), in the order the columns are added
    for feature_name, feature_fn in (('adar_index', adar_index),
                                     ('follows_back', follows_back),
                                     ('same_comp', belongs_toWeakConnec),
                                     ('shortest_path', compute_shortestPath)):
        for frame in (df_final_train, df_final_test):
            frame[feature_name] = frame.apply(lambda row: feature_fn(row['source_node'],row['destination_node']),axis=1)

    #context manager closes the store even if one of the writes fails
    with pd.HDFStore('data/storage_sample_stage2.h5') as hdf:
        hdf.put('train_df',df_final_train, format='table', data_columns=True)
        hdf.put('test_df',df_final_test, format='table', data_columns=True)
else:
    df_final_train = pd.read_hdf('data/storage_sample_stage2.h5', 'train_df',mode='r')
    df_final_test = pd.read_hdf('data/storage_sample_stage2.h5', 'test_df',mode='r')


In [23]:
# Weight Features for source and destination of each link
#in-weight = 1/sqrt(1 + number of followers), out-weight = 1/sqrt(1 + number of followees)
Weight_in = {}
Weight_out = {}
for node in tqdm(train_graph.nodes()):
    n_followers = len(set(train_graph.predecessors(node)))
    n_followees = len(set(train_graph.successors(node)))
    Weight_in[node] = 1.0/np.sqrt(1+n_followers)
    Weight_out[node] = 1.0/np.sqrt(1+n_followees)

#for imputing with mean
mean_weight_in = np.mean(list(Weight_in.values()))
mean_weight_out = np.mean(list(Weight_out.values()))

100%|████████████████████████████████████████████████████████████████████| 1780938/1780938 [00:09<00:00, 191643.37it/s]


In [24]:
# Adding new set of features (stage 3): edge weights, page rank, katz and hits scores.
# The result is cached in data/storage_sample_stage3.h5 and reloaded from there when present.
if not os.path.isfile('data/storage_sample_stage3.h5'):
    for frame in (df_final_train, df_final_test):
        #in-weight of the destination and out-weight of the source; missing nodes get the mean weight
        frame['weight_in'] = frame.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
        frame['weight_out'] = frame.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))

        #some features engineerings on the in and out weights
        frame['weight_f1'] = frame.weight_in + frame.weight_out
        frame['weight_f2'] = frame.weight_in * frame.weight_out
        frame['weight_f3'] = (2*frame.weight_in + 1*frame.weight_out)
        frame['weight_f4'] = (1*frame.weight_in + 2*frame.weight_out)

        #(column prefix, per-node score dict, value for nodes missing from that dict):
        #page rank and katz fall back to their mean, hub and authority scores fall back to 0
        for prefix, scores, default in (('page_rank', pr, mean_pr),
                                        ('katz', katz, mean_katz),
                                        ('hubs', hits[0], 0),
                                        ('authorities', hits[1], 0)):
            frame[prefix + '_s'] = frame.source_node.apply(lambda x: scores.get(x,default))
            frame[prefix + '_d'] = frame.destination_node.apply(lambda x: scores.get(x,default))

    #context manager closes the store even if one of the writes fails
    with pd.HDFStore('data/storage_sample_stage3.h5') as hdf:
        hdf.put('train_df',df_final_train, format='table', data_columns=True)
        hdf.put('test_df',df_final_test, format='table', data_columns=True)
else:
    df_final_train = pd.read_hdf('data/storage_sample_stage3.h5', 'train_df',mode='r')
    df_final_test = pd.read_hdf('data/storage_sample_stage3.h5', 'test_df',mode='r')

In [25]:
# Adding new set of features
#for svd features: map every node value to its row/column index in the adjacency matrix
sadj_col = sorted(train_graph.nodes())
sadj_dict = dict(zip(sadj_col, range(len(sadj_col))))

#adjacency matrix with rows/columns in the same sorted node order as sadj_dict
Adj = nx.adjacency_matrix(train_graph,nodelist=sadj_col).asfptype()

#truncated SVD keeping 6 singular values
U, s, V = svds(Adj, k = 6)
for label, matrix in (('Adjacency matrix Shape', Adj), ('U Shape', U), ('V Shape', V), ('s Shape', s)):
    print(label,matrix.shape)

Adjacency matrix Shape (1780938, 1780938)
U Shape (1780938, 6)
V Shape (6, 1780938)
s Shape (6,)


In [26]:
#Stage 4: six SVD components for the source and the destination node, taken from U and from V.T.
#The result is cached in data/storage_sample_stage4.h5 and reloaded from there when present.
if not os.path.isfile('data/storage_sample_stage4.h5'):
    for frame in (df_final_train, df_final_test):
        #(column prefix, factor matrix with one row per node, column holding the node)
        for prefix, factors, node_column in (('svd_u_s', U, 'source_node'),
                                             ('svd_u_d', U, 'destination_node'),
                                             ('svd_v_s', V.T, 'source_node'),
                                             ('svd_v_d', V.T, 'destination_node')):
            component_cols = ['%s_%d' % (prefix, k) for k in range(1, 7)]
            frame[component_cols] = \
            frame[node_column].apply(lambda x: svd(x, factors,sadj_dict)).apply(pd.Series)

    #context manager closes the store even if one of the writes fails
    with pd.HDFStore('data/storage_sample_stage4.h5') as hdf:
        hdf.put('train_df',df_final_train, format='table', data_columns=True)
        hdf.put('test_df',df_final_test, format='table', data_columns=True)
else:
    #without this branch an existing stage-4 cache left the frames without the svd_* columns
    df_final_train = pd.read_hdf('data/storage_sample_stage4.h5', 'train_df',mode='r')
    df_final_test = pd.read_hdf('data/storage_sample_stage4.h5', 'test_df',mode='r')

In [28]:
#Adding few more features (stage 5): preferential attachment scores and SVD dot products.
#The result is cached in data/storage_sample_stage5.h5 and reloaded from there when present.
if not os.path.isfile('data/storage_sample_stage5.h5'):
    #mapping preferential_followers and preferential_followee to train and test data
    for feature_name, score_fn in (('preferential_followers', preferential_followers),
                                   ('preferential_followee', preferential_followee)):
        for frame in (df_final_train, df_final_test):
            frame[feature_name] = frame.apply(lambda row:
                                              score_fn(row['source_node'],row['destination_node']),axis=1)

    #dot product of the source and destination SVD vectors, once for the U columns and once for the V columns
    for frame in (df_final_train, df_final_test):
        for side in ('u', 'v'):
            dot = frame['svd_%s_s_1' % side]*frame['svd_%s_d_1' % side]
            for k in range(2, 7):
                dot = dot + frame['svd_%s_s_%d' % (side, k)]*frame['svd_%s_d_%d' % (side, k)]
            frame['svd_dot_' + side] = dot

    #context manager closes the store even if one of the writes fails
    with pd.HDFStore('data/storage_sample_stage5.h5') as hdf:
        hdf.put('train_df',df_final_train, format='table', data_columns=True)
        hdf.put('test_df',df_final_test, format='table', data_columns=True)
else:
    df_final_train = pd.read_hdf('data/storage_sample_stage5.h5', 'train_df',mode='r')
    df_final_test = pd.read_hdf('data/storage_sample_stage5.h5', 'test_df',mode='r')