# Demo Collab with BigQuery and Google Cloud Storage

In [46]:
# Dependencies for loading the data, training the model and exporting the results
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Prep

In [49]:
# Load every row of the heart table from BigQuery into a pandas DataFrame.
bq_client = bigquery.Client()
query = "SELECT * FROM `prolaio-data-testing.vpasquierdemo.heart`"
query_job = bq_client.query(query)
dataset = query_job.to_dataframe()
# Preview the first rows.
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [48]:
# One-hot encode the columns listed below; the remaining columns are kept as they are.
columns_to_encode = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
dataset_dummies = pd.get_dummies(dataset, columns=columns_to_encode)
dataset_dummies.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,63,145,233,150,2.3,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,37,130,250,187,3.5,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
2,41,130,204,172,1.4,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
3,56,120,236,178,0.8,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
4,57,120,354,163,0.6,1,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0


In [50]:
# Standardize the numeric columns listed below, overwriting them in dataset_dummies.
# NOTE(review): the scaler is fit on every row, before the train/test split done
# further down, so test-set statistics influence the scaling. Consider fitting it
# on the training rows only and applying transform() to the test rows.
standard_scaler = StandardScaler()
columns_to_scale = ["age", "trestbps", "chol", "thalach", "oldpeak"]
scaled_values = standard_scaler.fit_transform(dataset_dummies[columns_to_scale])
dataset_dummies[columns_to_scale] = scaled_values
dataset_dummies.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.952197,0.763956,-0.256334,0.015443,1.087338,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-1.915313,-0.092738,0.072199,1.633471,2.122573,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
2,-1.474158,-0.092738,-0.816773,0.977514,0.310912,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
3,0.180175,-0.663867,-0.198357,1.239897,-0.206705,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
4,0.290464,-0.663867,2.08205,0.583939,-0.379244,1,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0


In [53]:
# Separate the label column from the features, then hold out 20% of the rows for testing.
y = dataset_dummies["target"]
X = dataset_dummies.drop(columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report how many rows ended up in each split.
print(len(X_train))
print(len(X_test))

242
61


## Random Forest


In [55]:
# Tune the number of trees of a random forest with cross-validation on the
# training rows, then report accuracy on the held-out test rows.
# Seed shared by the forest and the search so that a re-run reproduces the same
# best parameters and score (same value as the train/test split seed).
RF_RANDOM_STATE = 42
rf_param_grid = {
    "n_estimators": range(1, 100, 10),
}
rf = RandomForestClassifier(random_state=RF_RANDOM_STATE)
rf_random = RandomizedSearchCV(
    param_distributions=rf_param_grid,
    estimator=rf,
    scoring="accuracy",
    verbose=0,
    # The grid only holds 10 candidates. Requesting more iterations than that
    # makes scikit-learn emit a warning and fall back to the grid size anyway.
    n_iter=len(rf_param_grid["n_estimators"]),
    cv=4,
    random_state=RF_RANDOM_STATE,
)
rf_random.fit(X_train, y_train)
best_params = rf_random.best_params_
print(f"Best parameters: {best_params}")
# score() uses the "accuracy" scorer configured above.
print(f"Score: {rf_random.score(X_test, y_test)}")



Best parameters: {'n_estimators': 30}
Score: 0.819672131147541


In [None]:
# Plot results
def feature_imp(df, model):
    """Rank the columns of ``df`` by the importance the fitted search assigned them.

    Args:
        df: DataFrame whose column labels name the features, in the same order
            as the model's importance values.
        model: Fitted search object exposing
            ``best_estimator_.feature_importances_``.

    Returns:
        DataFrame with a ``feature`` and an ``importance`` column, sorted from
        the most to the least important feature.
    """
    importances = model.best_estimator_.feature_importances_
    ranking = pd.DataFrame({"feature": df.columns, "importance": importances})
    return ranking.sort_values(by="importance", ascending=False)

# Rank the features with the tuned forest and chart the ranking.
features_importance = feature_imp(X_train, rf_random)
ax = features_importance.plot(
    x="feature", y="importance", kind="barh", figsize=(10, 7), legend=False
)
# Rows are sorted from most to least important; flip the axis so the ranking
# reads top-down instead of starting at the bottom of the chart.
ax.invert_yaxis()
ax.set(
    title="Random forest feature importance",
    xlabel="importance",
    ylabel="feature",
);

## Pushing data

In [57]:
# Preview the highest-ranked features before exporting the ranking.
features_importance.head()

Unnamed: 0,feature,importance
4,oldpeak,0.102463
21,ca_0,0.092961
28,thal_2,0.080438
29,thal_3,0.076439
3,thalach,0.075622


In [59]:
# Export the feature importances as a CSV object in Google Cloud Storage.
GCS_BUCKET_NAME = "vpasquierdemo"
GCS_BLOB_NAME = "feat_imp.csv"

# Demo toggle: set to True to upload all-zero importances and see the impact on bigquery.
ZERO_OUT_IMPORTANCE = False
export_df = (
    features_importance.assign(importance=0)
    if ZERO_OUT_IMPORTANCE
    else features_importance
)

# Serialize the dataframe to CSV text (called without a target, to_csv returns a string).
csv_payload = export_df.to_csv(index=False)

# Create client
gcs_client = storage.Client()

# Locate the destination object
bucket = gcs_client.get_bucket(GCS_BUCKET_NAME)
blob = bucket.blob(GCS_BLOB_NAME)

# Upload the CSV content to GCS, labelled with the matching content type
blob.upload_from_string(csv_payload, content_type="text/csv")

## Prolaio toolkit

In [None]:
# Alternatively, use prolaiotoolkit to push the file directly, like this:
# from prolaiotoolkit.gcp.gcs_kit import write_df_as_csv
# write_df_as_csv(features_importance, bucket_name="vpasquierdemo", filename="feat_imp.csv")