In [1]:
import pandas as pd
from sklearn import linear_model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import geopandas
from matplotlib import pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler

  shapely_geos_version, geos_capi_version_string


In [14]:
def train_model(df):
    
    xcols = ["Install_year","CI","DI","SPUN","SAND","years_since_break","prior_breaks","age"]
    ycol = "y"
    
    df.drop("Unnamed: 0",axis = 1,inplace = True)
    df = df[df["Install_year"].notnull()].copy()
    df["age"]=df["year"]-df["Install_year"]
    df["years_since_break"]=df["year"]-df["last_break_yr"]
    df[ycol] = df[ycol].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(df[xcols], df[ycol], test_size=0.2, random_state = 63)
    poly=xcols
    trans = make_column_transformer((PolynomialFeatures(degree=2), poly),
                                remainder="passthrough"
                                )
    pipe = Pipeline([
                ("trans", trans),
                ("scaler",StandardScaler()),
                ("lr", LogisticRegression(max_iter=200)),
                ])
    pipe.fit(X_train,y_train)
    return pipe

In [4]:
save_columns = ["OBJECTID_r","segment_id","segment_na","from_segme","to_segment","pvmt_ratin","curb_ratin","geometry"]

In [5]:
def format_predict(df,predictor):
    df = df[df["OBJECTID_a"].notnull()].copy()
    df["Install_year"] = df["OBJECTID_r"].apply(lambda obj_id: df[df["OBJECTID_r"]==obj_id]["InstallYea"].mean())
    df["last_break_yr"] = df["OBJECTID_r"].apply(lambda obj_id: df[df["OBJECTID_r"]==obj_id]["break_year"].max())
    df["prior_breaks"] = df["OBJECTID_r"].apply(lambda obj_id: len(df[df["OBJECTID_r"]==obj_id]))
    for m in ["CI","DI","SPUN","SAND"]:
        df[m] = df["OBJECTID_r"].apply(lambda obj_id: int(m in df[df["OBJECTID_r"]==obj_id]["Material"].values))
    df["year"] = 2020
    df = df[save_columns+xcols].copy()
    df.drop_duplicates("OBJECTID_r",keep = "last",inplace = True)
    df["prediction"] = predictor.predict_proba(df[xcols])[:,1]
    return df[save_columns+["prediction"]]

In [6]:
def format_nvr_broken(df,break_rate):
    df = df[df["OBJECTID_a"].isnull()].copy()
    df["prediction"] = break_rate
    df.drop_duplicates("OBJECTID_r",inplace = True)
    return df[save_columns+["prediction"]]

In [7]:
road = geopandas.read_file("../dataset/water_mains_with_roads")

In [8]:
def make_prediction_map(df1,df2,ax,future):
    df = df1.append(df2,ignore_index = True)
    df.to_file(str(future)+"yr_prediction")
    df.plot(column = "prediction",legend = True,ax=ax,cmap = "Reds",vmin = 0,vmax = 1)
    ax.axis("off")
    ax.set_title(str(future)+" Years Road Risk Level Prediction")

In [9]:
fresh_break_rate = {1:2154/(37247+2154),5:2154/(29480+2154),10:2154/(21505+2154),20:2154/(9830+2154)}

In [10]:
fresh_break_rate.keys()

dict_keys([1, 5, 10, 20])

In [11]:
def pvmt_rating_plot(df,ax):
    color_map = plt.cm.get_cmap('Blues') 
    reversed_color_map = color_map. reversed() 
    df.plot(column = "pvmt_ratin",ax = ax,cmap = reversed_color_map,legend = True)
    ax.axis("off")

In [12]:
pixel = geopandas.read_file("zip://../dataset/Hi.zip")
def overlap(interval,ax):
    df = geopandas.read_file(str(interval)+"yr_prediction")
    df=df[["geometry","prediction","pvmt_ratin"]]
    joined = geopandas.sjoin(pixel, df, how="inner", op='intersects')
    joined = joined[["geometry","prediction"]]
    joined["index"]=joined.index
    joined=joined.groupby(by='index').agg({'prediction': 'max','geometry':'first'}).reset_index()
    joined=joined[["geometry","prediction"]]
    joined=geopandas.GeoDataFrame(joined)
    joined.plot(column="prediction",cmap = "Greys",vmin = 0,vmax = 1,ax=ax,legend = True,legend_kwds={'label': "Break Probability"})
    df.plot(column="pvmt_ratin",cmap="Reds_r",ax=ax,legend = True,legend_kwds={'label': "Pavement Rating"})
    ax.axis("off")

In [16]:
df = pd.read_csv("../dataset/ML_5yr_dataset.csv")
predictor = train_model(df)
first_break = format_nvr_broken(road,fresh_break_rate)
multiple_break = format_predict(road,predictor)

0.8180677540777918
{'fit_time': array([0.01020098, 0.00979161, 0.0099628 , 0.00999284, 0.00978684,
       0.00970173, 0.00996184, 0.00989866, 0.00988889, 0.01031041]), 'score_time': array([0.00239968, 0.00220346, 0.00229025, 0.00238514, 0.00227857,
       0.00221038, 0.00232267, 0.00227499, 0.00245309, 0.00241804]), 'test_score': array([0.79623824, 0.81818182, 0.79623824, 0.79937304, 0.80877743,
       0.78369906, 0.80564263, 0.78930818, 0.77044025, 0.8427673 ])}


ValueError: Length of values (4) does not match length of index (7723)