In [62]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import pickle

class FirstModel():
  def __init__(self, cluster):
      cluster['o_tract10'] = cluster['TRACT'].astype(float)
      self.cluster = cluster
      self.pls = PLSRegression(n_components=10)
      self.encoder = OneHotEncoder()
      self.linear = LinearRegression()
      self.knn = KNeighborsRegressor()

  def train(self, x, y, w):
    x['o_tract10'] = x[['o_tract10']].merge(self.cluster[['o_tract10', 'Cluster']], on='o_tract10', how="left")['Cluster'].fillna(-1)
    x,y,w = x.iloc[:,1:], y.iloc[:,1:].values, w.iloc[:,1:].values

    x = self.encoder.fit_transform(x.values).toarray()
    x, _ = self.pls.fit_transform(x, y)
    self.linear.fit(x,y,sample_weight = w.ravel())

  def predict(self, features, mode):
    features[-1] = self.cluster[self.cluster['o_tract10']==features[-1].astype(float)]['Cluster'].values[0]
    x = self.pls.transform(self.encoder.transform([features]).toarray())
    return self.linear.predict(x)[0]



class ThirdModel():
    def __init__(self, cluster, inter, outer):
        self.cluster = cluster
        self.inter = inter
        self.outer = outer
        self.inter_model = PLSRegression(n_components=10)
        self.outer_model = PLSRegression(n_components=10)
        self.encoder_x = OneHotEncoder()
        self.encoder_y = OneHotEncoder()

    def train(self, inter_x, inter_y, outer_x, outer_y):
      self.encoder_x.fit_transform(pd.concat([inter_x.iloc[:,:10], outer_x.iloc[:,:10]], axis=0).values)
      inter_x_soc = pd.DataFrame(self.encoder_x.transform(inter_x.iloc[:,:10].values).toarray())
      inter_x = pd.concat([inter_x_soc, inter_x.iloc[:,10:]], axis=1)
      outer_x_soc = pd.DataFrame(self.encoder_x.transform(outer_x.iloc[:,:10].values).toarray()).reset_index().drop(columns="index")
      outer_x = outer_x.reset_index().drop(columns="index")
      outer_x = pd.concat([outer_x_soc, outer_x.iloc[:,10:]], axis=1)
      y = pd.concat([inter_y, outer_y], axis=0).reset_index()
      y = y.pivot_table(index=y.index,columns="main_mode",values="weight").fillna(0)
      inter_len = len(inter_y)
      inter_y = y.iloc[:inter_len, :]
      outer_y = y.iloc[inter_len:, :]
      self.inter_model.fit(inter_x.values, inter_y.values)
      self.outer_model.fit(outer_x.values, outer_y.values)

    def predict(self, features):
      val = self.encoder_x.transform([features[:-2]])[0].toarray()[0].tolist()
      if features[-2] == features[-1]:
        att = self.cluster.merge(self.inter, on='Cluster')
        value = att[att['TRACT'] == features[-1]].iloc[:,2:].values.tolist()[0]
        for ele in value:
          val.append(ele)
        result = self.inter_model.predict([val])
      else:
        att = self.cluster.merge(self.outer, on='TRACT', how='left')
        ct = features[-2]
        value_o = att[att['TRACT']==ct][['centroid_x','centroid_y']].values[0]
        ct = features[-1]
        value_d = att[att['TRACT']==ct][['centroid_x','centroid_y']].values[0]
        value = np.sqrt(sum((value_o-value_d)**2))
        val.append(value)
        result = self.outer_model.predict([val])
      return result[0]

In [63]:
cluster = pd.read_csv("clustering.csv", index_col=0)
x = pd.read_csv("x.csv", index_col=0)
predict_feature = x.iloc[1,1:].values
y = pd.read_csv("y.csv", index_col=0)
w = pd.read_csv("w.csv", index_col=0)
first_model = FirstModel(cluster)
first_model.train(x,y,w)

first_model.predict(predict_feature,"random")

array([0.06070858, 1.65950763, 1.1624303 , 0.00628228, 0.39947489,
       0.37797847])

In [59]:
predict_feature

array(['2 vehicles', '$75,000-$99,999', 'No', 2, 0, '65 years+', 'Male',
       'Retired', 'Some college',
       'Yes, has an intermediate or unrestricted license',
       149    5
       Name: Cluster, dtype: int64], dtype=object)

In [None]:
cluster = pd.read_csv("clustering.csv", index_col=0)
inter = pd.read_csv("inter_cluster.csv", index_col=0)
outer = pd.read_csv("cross_centroid.csv", index_col=0)
inter_x = pd.read_csv("inter_x.csv", index_col=0)
inter_y = pd.read_csv("inter_y.csv", index_col=0)
outer_x = pd.read_csv("outer_x.csv", index_col=0)
outer_y = pd.read_csv("outer_y.csv", index_col=0)

third_model = ThirdModel(cluster, inter, outer)
third_model.train(inter_x, inter_y, outer_x, outer_y)

with open('third_model.pkl', 'wb') as f:
    pickle.dump(third_model, f)