In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree
from sklearn.impute import KNNImputer
from cleaning_function import *

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import BallTree
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# --- Load, clean and impute the master table ---------------------------------
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so non-numeric columns hold consistently typed
# values before they are category-encoded below.
# NOTE(review): the saved output of this cell shows a warning raised on the
# read_csv line -- presumably a mixed-type DtypeWarning; confirm this removes it.
df = pd.read_csv('cvrm_master.csv', low_memory=False)
# Treat +/-inf as missing so it is imputed like any other gap.
df = df.replace([np.inf, -np.inf], np.nan)
# drop_null_columns comes from the `cleaning_function` star import; presumably
# it drops columns whose null fraction exceeds 0.6 -- TODO confirm in its source.
df = drop_null_columns(df, 0.6)

# The column lists are captured BEFORE the categoricals are integer-encoded, so
# numerical_cols only contains columns that were numeric in the cleaned frame.
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
# Encode every non-numeric column as integer category codes; missing values get
# their own "missing" category instead of the -1 code.
df[categorical_cols] = df[categorical_cols].fillna("missing").astype("category").apply(lambda x: x.cat.codes)

# Haversine BallTree over (lat, lon) in radians, queried with the same points
# it was built from (k=11 presumably means "self + 10 neighbours" -- confirm).
# NOTE(review): `distances` / `indices` are not used by the imputer below, which
# searches neighbours in feature space; keep only if another cell reads them.
coords = np.radians(df[['Latitude', 'Longitude']].to_numpy())
tree = BallTree(coords, metric="haversine")
distances, indices = tree.query(coords, k=11)

# KNN imputation of the numeric columns. By default KNNImputer drops columns
# with no observed values at all, which would break the column labels below;
# this assumes drop_null_columns has already removed such columns.
knn_imputer = KNNImputer(n_neighbors=10)
df_numerical_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign rows whenever df's index is not 0..n-1.
    index=df.index,
)
df_final = pd.concat([df_numerical_imputed, df[categorical_cols]], axis=1)


# --- 5-fold evaluation on the already-imputed table ---------------------------
# For each fold: standardize the numeric columns (statistics from the training
# split only), project onto principal components, then fit a Ridge model that
# maps the component scores back to the standardized features. The reported
# RMSE is therefore a reconstruction error in standardized units.
# No geographic neighbour search is performed here; only numerical_cols are used.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_rmse = []

for fold, (train_idx, test_idx) in enumerate(kf.split(df_final)):
    print(f"Starting Fold {fold + 1}")
    train_data, test_data = df_final.iloc[train_idx], df_final.iloc[test_idx]

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    train_data_scaled = scaler.fit_transform(train_data[numerical_cols])
    test_data_scaled = scaler.transform(test_data[numerical_cols])

    # PCA requires n_components <= min(n_samples, n_features); cap the target
    # of 20 so narrow or small inputs do not raise.
    n_components = min(20, train_data_scaled.shape[0], train_data_scaled.shape[1])
    pca = PCA(n_components=n_components)
    train_reduced = pca.fit_transform(train_data_scaled)
    test_reduced = pca.transform(test_data_scaled)

    # Multi-output Ridge: component scores -> all standardized features.
    model = Ridge(alpha=1.0)
    model.fit(train_reduced, train_data_scaled)
    test_predictions = model.predict(test_reduced)

    mse = mean_squared_error(test_data_scaled, test_predictions)
    rmse = np.sqrt(mse)
    all_rmse.append(rmse)

    print(f"Fold {fold + 1} RMSE: {rmse}")

print(f"Average RMSE across all folds: {sum(all_rmse) / len(all_rmse)}")


  df = pd.read_csv('cvrm_master.csv')


Starting Fold 1
Fold 1 RMSE: 0.6138167568472236
Starting Fold 2
Fold 2 RMSE: 0.6134611520665076
Starting Fold 3
Fold 3 RMSE: 0.7954537505754798
Starting Fold 4
Fold 4 RMSE: 0.6089049117525633
Starting Fold 5
Fold 5 RMSE: 0.5695857826866944
Average RMSE across all folds: 0.6402444707856937


In [4]:
# Write the imputed table to disk; the DataFrame index is not written.
geoknn_path = 'geoknn.csv'
df_final.to_csv(geoknn_path, index=False)


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import warnings

# --- 5-fold evaluation with imputation fitted inside each fold ----------------
# NOTE(review): `df` is whatever an earlier cell left in the kernel. If its
# non-numeric columns were already integer-encoded there, they are selected as
# numeric here and become part of the feature set -- confirm this is intended.
#
# replace() returns a new frame, so the shared `df` is not modified through an
# alias when +/-inf is turned into NaN.
data = df.replace([np.inf, -np.inf], np.nan)
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
data_numeric = data[numerical_cols]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_rmse = []

# Warnings are silenced only for the duration of the loop; the previous filter
# state is restored afterwards so later cells still surface their warnings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for fold, (train_idx, test_idx) in enumerate(kf.split(data_numeric)):
        print(f"Starting Fold {fold + 1}")

        train_data, test_data = data_numeric.iloc[train_idx], data_numeric.iloc[test_idx]

        # Fit the imputer on the training split only and reuse it on the test
        # split. By default KNNImputer drops columns with no observed values in
        # the data it is fitted on; this assumes every numeric column has at
        # least one observed value per training split, otherwise the column
        # labels below no longer match the imputer output.
        knn_imputer = KNNImputer(n_neighbors=10)
        train_imputed = pd.DataFrame(knn_imputer.fit_transform(train_data), columns=numerical_cols)
        test_imputed = pd.DataFrame(knn_imputer.transform(test_data), columns=numerical_cols)

        # Standardize with training-split statistics.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_imputed)
        test_scaled = scaler.transform(test_imputed)

        # PCA requires n_components <= min(n_samples, n_features).
        pca = PCA(n_components=min(20, *train_scaled.shape))
        train_reduced = pca.fit_transform(train_scaled)
        test_reduced = pca.transform(test_scaled)

        # Ridge maps component scores back to the standardized features, so the
        # RMSE below is a reconstruction error in standardized units.
        model = Ridge(alpha=1.0)
        model.fit(train_reduced, train_scaled)
        test_predictions = model.predict(test_reduced)
        mse = mean_squared_error(test_scaled, test_predictions)
        rmse = np.sqrt(mse)
        all_rmse.append(rmse)

        print(f"Fold {fold + 1} RMSE: {rmse}")

print(f"Average RMSE across all folds: {sum(all_rmse) / len(all_rmse)}")


Starting Fold 1
Fold 1 RMSE: 0.6152399756600554
Starting Fold 2
Fold 2 RMSE: 0.6149490570970514
Starting Fold 3
Fold 3 RMSE: 0.7938901198658991
Starting Fold 4
Fold 4 RMSE: 0.6104142982945249
Starting Fold 5
Fold 5 RMSE: 0.5713967125198606
Average RMSE across all folds: 0.6411780326874783


In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.neighbors import BallTree

# --- Rebuild the imputed table from the raw CSV and export it -----------------
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so non-numeric columns hold consistently typed
# values before they are category-encoded below.
df = pd.read_csv('cvrm_master.csv', low_memory=False)
# Treat +/-inf as missing so it is imputed like any other gap.
df = df.replace([np.inf, -np.inf], np.nan)
# drop_null_columns is not defined in this cell (it is expected to be in scope
# already); presumably it drops columns whose null fraction exceeds 0.6 --
# TODO confirm in its source.
df = drop_null_columns(df, 0.6)

# Capture the column lists before the categoricals are integer-encoded.
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
# Encode non-numeric columns as integer category codes; missing values get
# their own "missing" category.
df[categorical_cols] = df[categorical_cols].fillna("missing").astype("category").apply(lambda x: x.cat.codes)

# Haversine BallTree over (lat, lon) in radians, queried with its own points.
# NOTE(review): `distances` / `indices` are not used by the imputer below, which
# searches neighbours in feature space; keep only if another cell reads them.
coords = np.radians(df[['Latitude', 'Longitude']].to_numpy())
tree = BallTree(coords, metric="haversine")
distances, indices = tree.query(coords, k=11)

# KNN imputation of the numeric columns (assumes no column is entirely NaN,
# since KNNImputer drops such columns by default and the labels would not fit).
knn_imputer = KNNImputer(n_neighbors=10)
df_numerical_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign rows whenever df's index is not 0..n-1.
    index=df.index,
)
df_final = pd.concat([df_numerical_imputed, df[categorical_cols]], axis=1)

# Write the imputed table to disk; the DataFrame index is not written.
df_final.to_csv("geoknn.csv", index=False)


In [11]:
# Total number of missing cells in df_final: per-column counts, then summed.
missing_per_column = df_final.isna().sum()
missing_per_column.sum()

0