In [353]:
import pandas as pd
import math
import networkx as nx
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

In [158]:
#dummies
df = pd.read_csv(r'Mid07new_with_dist1_dist10_small.txt',sep=',')
df = df[['realprice1', 'realprice2', 'realprice3', 'realprice4',
       'br1', 'br2', 'br3', 'br4', 'residual.price.1', 'residual.price.2',
       'residual.price.3', 'residual.price.4','TRADEMANUFACTURER',
       'TRADENAME', 'TRADEMODEL', 'TRADEORIGIN', 'TRADEBODYTP',
       'TRADECYLINDER',"CUSTLATITUDE","CUSTLONGITUDE",'ZIPCODE','brand']].copy()
df = pd.get_dummies(df,drop_first=True)

In [7]:
#latlon data prepare
df = df[["CUSTLATITUDE","CUSTLONGITUDE",'brand']].copy()
sort_lat = df.sort_values(by=['CUSTLATITUDE']).reset_index()
sort_lon = df.sort_values(by=['CUSTLONGITUDE']).reset_index()

In [123]:
def toDict(df,dist):
    graph_dict = {}
    for index, row in df.iterrows():
        neighbors = getNeighbors(row['CUSTLATITUDE'],row['CUSTLONGITUDE'],dist)
        graph_dict[index] = neighbors
    return graph_dict
def cal_delta(lat1,lon1,d,brng, latorlon):
    R = 6378.1 #Radius of the Earth
    brng = math.radians(brng) #Bearing is 90 degrees converted to radians.

    lat1 = math.radians(lat1) #Current lat point converted to radians
    lon1 = math.radians(lon1) #Current long point converted to radians

    lat2 = math.asin( math.sin(lat1)*math.cos(d/R) + math.cos(lat1)*math.sin(d/R)*math.cos(brng))
    lon2 = lon1 + math.atan2(math.sin(brng)*math.sin(d/R)*math.cos(lat1),math.cos(d/R)-math.sin(lat1)*math.sin(lat2))

    lat2 = math.degrees(lat2)
    lon2 = math.degrees(lon2)
    
    if latorlon == 'lat':
        return lat2
    else: return lon2

In [128]:
def getNeighbors(lat,lon,dist):
    lat_n = cal_delta(lat,lon,dist,0,'lat') #north bound
    lat_s = cal_delta(lat,lon,dist,180,'lat') #south bound
    lon_e = cal_delta(lat,lon,dist,90,'lon')
    lon_w = cal_delta(lat,lon,dist,270,'lon')
    #print(lat_n,lat_s,lon_e,lon_w)
    
    lat_l = sort_lat['CUSTLATITUDE'].searchsorted(lat_s,side = 'left')
    lat_r = sort_lat['CUSTLATITUDE'].searchsorted(lat_n,side = 'right')
    lon_l = sort_lon['CUSTLONGITUDE'].searchsorted(lon_w,side = 'left')
    lon_r = sort_lon['CUSTLONGITUDE'].searchsorted(lon_e,side = 'right')
    #print(lat_l,lat_r,lon_l,lon_r)
    
    lat_set = set(sort_lat[lat_l:lat_r]['index'])
    #print(sort_lat[lat_l:lat_r])
    
    lon_set = set(sort_lon[lon_l:lon_r]['index'])
    #print(sort_lon[lon_l:lon_r])
    final_set = lat_set.intersection(lon_set)
    return final_set

In [157]:
#latlong dictionary
latlon_dict = toDict(df,5)
for key in latlon_dict:
    latlon_dict[key].remove(key)

In [159]:
#zip dictionary
df = df.reset_index()
zip_dict =  df.groupby('ZIPCODE')['index'].apply(list).to_dict()
zip_index_dict = {}
for key in zip_dict:
    index_set = zip_dict[key]
    for index in index_set:
        zip_index_dict[index] = set(index_set)
for key in zip_index_dict:
    zip_index_dict[key].remove(key)

In [177]:
def StratifiedSampling(df, test_size):
    X = df.drop(['brand'], axis=1)
    y = df.brand
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)    
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        train = df.iloc[train_index,:]
        test = df.iloc[test_index,:]
    return train,test
train,test = StratifiedSampling(df, 0.2)

In [550]:
#dict to graph
G = nx.from_dict_of_lists(latlon_dict)
for index,row in train.iterrows():
    G.nodes[index]['label'] = row.brand
    G.nodes[index]['train'] = 1
for index,row in test.iterrows():
    G.nodes[index]['label'] = 0
    G.nodes[index]['train'] = 0
H = G.copy()

In [551]:
def propagate_graph(G):
    count=0
    for node in list(G.nodes):
        if G.nodes[node]['train']==1: 
            count+=1
            continue #leave out train node
        neighbors = list(G.neighbors(node))
        #if node == 10: print(neighbors)
        if not neighbors: #assign majority probability for node without neighbors
            H.nodes[node]['prob_dist'] = [0.25,0.25,0.25,0.25]
            H.nodes[node]['label'] = 1
        #if node == 10: print(G.nodes[node])
        else:
            train_nodes=[]
            for neigh in neighbors: #assgin probaility distribution from labled neighbors
                if G.nodes[neigh]['label']!=0:
                    train_nodes.append(G.nodes[neigh]['label'])
            #if node == 10: print(train_nodes)
            if train_nodes:
                b_prob = [float(train_nodes.count(i))/len(train_nodes) for i in [1,2,3,4]]
                #print(node,b_prob)
                H.nodes[node]['prob_dist'] = b_prob
                H.nodes[node]['label'] = b_prob.index(max(b_prob))+1
                #print(H.nodes[node])
        #if node == 10: print(G.nodes[node])
        if G.nodes[node]['label']!=0:
            count+=1
        #else: 
            #print(node,G.nodes[node])
    return count

def MAD(norm_pred,y_test):
    MAD_list=[]
    for i in range(len(y_test)):
        mad_row = [1-n if j==y_test[i]-1 else n for j, n in enumerate(norm_pred[i]) ]
        MAD_list.append(sum(mad_row))
    return sum(MAD_list)/len(MAD_list)

In [552]:
#propagate
last_count = 0
labeled_count = len(train)
loop = 1
while loop:
    labeled_count = propagate_graph(G)
    G=H.copy()
    #print(labeled_count)
    if labeled_count>last_count: 
        last_count = labeled_count
    else: 
        loop = 0
    
for node in list(G.nodes): #assign to node with all neighbors in test
    if G.nodes[node]['label']==0:
        print(node)
        G.nodes[node]['prob_dist'] = [0.25,0.25,0.25,0.25]
        G.nodes[node]['label'] = 1

168
410


In [556]:
class predClass:
    def __init__(self, index, dist, label):
        self.index = index
        self.dist = dist
        self.label = label
        pred_label_list = [0]*4
        pred_label_list[label-1]=1
        self.label_list = pred_label_list

pred_list=[]        
for node in list(G.nodes):
    if G.nodes[node]['train']==0:
        pred_obj = predClass(node, G.nodes[node]['prob_dist'], G.nodes[node]['label'])
        pred_list.append(pred_obj)
pred_list.sort(key=lambda x: x.index)

In [557]:
sortedtest = test.sort_index()
y_test = sortedtest.brand
print('normMAD: ', round(MAD([x.dist for x in pred_list],list(y_test)),2))
print('0or1MAD: ', round(MAD([x.label_list for x in pred_list],list(y_test)),2))
print('accuracy: ', round(accuracy_score(y_test, [x.label for x in pred_list]),2))

normMAD:  1.01
0or1MAD:  0.81
accuracy:  0.59


In [469]:
[x.dist for x in pred_list]

[[0.6666666666666666,
  0.13333333333333333,
  0.13333333333333333,
  0.06666666666666667],
 [0.84, 0.08, 0.08, 0.0],
 [0.7368421052631579, 0.05263157894736842, 0.21052631578947367, 0.0],
 [0.625, 0.2916666666666667, 0.08333333333333333, 0.0],
 [0.18518518518518517, 0.037037037037037035, 0.7777777777777778, 0.0],
 [0.7777777777777778, 0.05555555555555555, 0.16666666666666666, 0.0],
 [0.08333333333333333, 0.5833333333333334, 0.3333333333333333, 0.0],
 [0.2857142857142857,
  0.2857142857142857,
  0.2857142857142857,
  0.14285714285714285],
 [0.15384615384615385, 0.15384615384615385, 0.6923076923076923, 0.0],
 [0.3333333333333333, 0.0, 0.6666666666666666, 0.0],
 [0.7, 0.05, 0.2, 0.05],
 [1.0, 0.0, 0.0, 0.0],
 [0.8333333333333334, 0.0, 0.125, 0.041666666666666664],
 [0.13793103448275862, 0.5517241379310345, 0.3103448275862069, 0.0],
 [0.5833333333333334, 0.08333333333333333, 0.3333333333333333, 0.0],
 [0.07142857142857142, 0.07142857142857142, 0.8571428571428571, 0.0],
 [0.8181818181818182

In [368]:
def verify_webpage_numbers(G):
    num_edge = G.number_of_edges()
    num_node = G.number_of_nodes()
    avg_coe = nx.average_clustering(G)
    t = nx.triangles(G)    
    total_t = sum(t.values())
    A = (G.subgraph(c) for c in nx.connected_components(G))
    print('number of node:',num_node,'number of edge:',num_edge,'average coefficient:',avg_coe,'. number of triangles:', total_t)
verify_webpage_numbers(G)

number of node: 821 number of edge: 1162 average coefficient: 0.6650426309378806 . number of triangles: 4044


In [428]:
#test graph
count=0
for node in list(G.nodes):
    neighbors = list(G.neighbors(node))
    #if not neighbors:
        #print(node)
        #print(G.nodes[node])
        
    if G.nodes[node]['label']!=0:
        count+=1
    else:
        print('111',node)
        print('111',G.nodes[node])
print(count)

821


In [None]:
#dict to graph
G = nx.from_dict_of_lists(latlon_dict)
for index,row in train.iterrows():
    G.nodes[index]['label'] = row.brand
    G.nodes[index]['train'] = 1
for index,row in test.iterrows():
    G.nodes[index]['label'] = 0
    G.nodes[index]['train'] = 0
H = G.copy()