In [2]:
import pandas as pd
import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.externals import joblib

import pipenv.config as cfg
import datetime

class Rank_apts():
    def __init__(self, path_read, path_write):
        self.path_read= path_read
        self.path_write= path_write
        self.features_selected= cfg.features
        
    def df_read(self, path_read):
        df= pd.read_csv(path_read, index_col = False)
        print(self.path_read)
        #print(df.columns)
        return df

    def df_write(self, df, path_write):
        df.to_csv(path_write, index=False)
        print(path_write)    
    
    def prepare_data(self, df):
        #reset index to make consequence 0,1,2,3,4
        df= df.reset_index()
        index2id= df['id']
        
        #get selected feature
        cols= [x[0] for x in self.features_selected]
        df_selected_features= df[['id']+cols]
        self.df_write( df[['id']+cols], "pipenv/tmp/df_selected.csv")
        
        print(cols)
        df= df[cols]
        
        #set missing value as mean
        df= df.fillna(df.mean())       
        #df= df.dropna()
        
        #normalization 
        #col= cols[1:] #remove 'id'
        df = df/df.max().astype(np.float16)
        
        #add weight
        for col, w in self.features_selected:
            if col in cols:
                df[col]= df[col]*w
        #print(df.head())
        print(df.shape)
        return df_selected_features, df.as_matrix(), index2id  #matrix, index map apt id #
    
    def KNN(self, X):
        nbrs = NearestNeighbors(n_neighbors=100, algorithm='ball_tree').fit(X)
        start_time= datetime.datetime.now()
        distances, indices = nbrs.kneighbors(X)
        
        print("\nmodel training time taken: ", (datetime.datetime.now()-start_time), "h:m:s")
        print("this is off line training, not request\n")
        return distances, indices    
    
    def df_rank(self, indices, index2id):
        df_rank= pd.DataFrame(indices)
        df_rank= df_rank.applymap(lambda x: index2id[x])
        self.df_write(df_rank, self.path_write)
        return df_rank
    
    def df_distances(self, distances):
        df_distances= pd.DataFrame(distances)
        self.df_write(df_distances, "pipenv/tmp/distances.csv")
        return df_distances
        
    def analysis_distance(self, df_distances):
        df_distance_all_features_mean= df_distances.mean(axis=0)
        ax= df_distance_all_features_mean.plot(title="distance measure")
        ax.set_xlabel("rank")
        ax.set_ylabel("n dimentional distance")
        #return df_distance_all_features_mean 
    
    def nearest_neighbor_search(self, num_neighbors):
        df= self.df_read(self.path_read)
        df_selected_features, X, index2id= self.prepare_data(df)
        distances, indices= self.KNN(X)
        
        df_rank= self.df_rank(indices, index2id) 
        df_distances= self.df_distances(distances)
        
        return df_selected_features, df_rank, df_distances
if __name__ == "__main__": 
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('path_read', help='read data')
        parser.add_argument('path_write', help='write data')   
        parser.add_argument('num_nearest_neighbor', help='num_nearest_neighbor')
        args = parser.parse_args()
        path_read_features_matrix= args.path_read_features_matrix
        path_save_rank= args.path_save_rank 
        
    except:
        path_read= "pipenv/tmp/nsproperties_apt_exclusive_is_available_list_on_web_nyc_processed.csv"
        path_write= "pipenv/result/rank.csv"
        num_nearest_neighbor= 100

    rnk= Rank_apts(path_read, path_write)
    df, df_rank, df_distances= rnk.nearest_neighbor_search(num_nearest_neighbor)
    distance_all_features_mean= rnk.analysis_distance(df_distances)

pipenv/tmp/nsproperties_apt_exclusive_is_available_list_on_web_nyc_processed.csv
pipenv/tmp/df_selected.csv
['is_rental', 'is_commercial', 'rent', 'price', 'num_bedrooms', 'area', 'x', 'y', 'num_rooms', 'num_bathrooms', 'pets', 'ownership_id', 'is_new_development', 'common_charges', 'is_furnished', 'financing_allowed', 'commission']
(451, 17)

model training time taken:  0:00:00.013151 h:m:s
this is off line training, not request





pipenv/result/rank.csv
pipenv/tmp/distances.csv


In [3]:
df_rank[:13]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,894458,544270,73289,857043,52727,855703,861741,252693,815204,813865,...,909455,802600,749743,841098,57979,792927,800682,30302,909547,58087
1,70266,595358,623128,708233,884527,863109,888808,73618,49327,889421,...,651541,792816,824037,30299,30334,792815,831715,30301,30300,884031
2,582563,737072,596782,869875,901407,808494,887351,907147,43098,908363,...,794244,908865,756720,806242,906529,909455,749743,57979,41795,900151
3,899351,73662,899332,908332,866803,887405,792851,824030,857472,792811,...,24058,909455,791229,864030,34827,34825,34826,34820,805041,66432
4,869861,892727,752238,793352,898070,742664,62334,822505,755390,847747,...,836041,817995,757712,805492,877093,375581,843360,818037,715050,838468
5,678874,63414,400355,780510,798356,871033,60380,568593,48672,873526,...,754277,876237,800108,887463,851487,737072,792816,651541,840721,824037
6,866893,653746,747125,64483,64484,451358,821477,869944,745155,745164,...,58087,841098,855701,792941,547999,458396,909349,711559,815757,808494
7,751101,766617,855699,836961,61810,838736,897320,810000,822281,847753,...,742664,817705,847411,479459,822505,847747,836041,844080,833248,858527
8,901844,843125,892722,802773,714220,838474,765525,71999,838883,841097,...,862966,799367,894844,68254,871168,809430,759853,819499,771924,800535
9,869875,901407,907147,737072,582563,887351,43098,808494,908364,908363,...,873608,908865,806242,749743,906529,57979,909455,756720,41795,629881
