In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

In [None]:
# Attach Google Drive to the Colab filesystem so files under
# /content/drive can be read by the cells below.
drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Kickstarter export and narrow it down in one chained expression.
# Each step returns a new frame; the filters run in the order listed.
data = (
    pd.read_csv('/content/drive/My Drive/ks-projects-201801.csv', engine='python')
    # Discard the three columns that are not used anywhere below.
    .drop(columns=['goal', 'usd pledged', 'pledged'])
    # Keep only rows whose state is neither "live" nor "undefined".
    .query('state != "live" and state != "undefined"')
    # Keep rows with less than 5,000,000 pledged (usd_pledged_real).
    .query('usd_pledged_real < 5000000')
    # Keep rows with more than 10 backers.
    .query('backers > 10')
    # Keep rows whose goal (usd_goal_real) is below 1,000,000.
    .query('usd_goal_real < 1000000')
)


In [None]:
# Separate the regression target from the predictors.
# The target stays a single-column DataFrame (list selector), not a Series.
y_data = data.loc[:, ['usd_pledged_real']]
# Everything except the target column is kept as candidate input.
x_data = data.drop('usd_pledged_real', axis=1)



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Set up the train/test partitions: 70% of the rows for fitting, 30% held out.
#
# random_state pins the shuffle so the same rows land in each partition on
# every run; without it the split (and every score computed from it) changes
# each time the notebook is re-executed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=42)

# Remember the IDs of the held-out rows while x_test still has its ID column.
test_ids = x_test.ID

# Report the partition sizes as (rows, columns).
print('x training shape =', x_train.shape)
print('y training shape =', y_train.shape)
print('x testing shape =', x_test.shape)
print('y testing shape =', y_test.shape)

x training shape = (135479, 11)
y training shape = (135479, 1)
x testing shape = (58063, 11)
y testing shape = (58063, 1)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [None]:
# Preprocessing: one small pipeline per column type, combined column-wise.

# Categorical columns: replace missing entries with the literal 'missing',
# then one-hot encode. handle_unknown='ignore' keeps transform from raising
# on categories that were not present when the encoder was fitted.
categorical_features = ['category', 'main_category', 'currency', 'state', 'country']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill missing values with the column median, then apply
# RobustScaler.
numeric_features = ['backers', 'usd_goal_real']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Route each column group to its pipeline. The 'num' block is listed first,
# followed by the 'cat' block.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Model: the shared preprocessor feeding a k-nearest-neighbours regressor.
regr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
])

# Narrow both partitions to the columns the preprocessor is configured for
# (numeric columns first, then categorical ones).
x_train = x_train[numeric_features + categorical_features]
x_test = x_test[numeric_features + categorical_features]

# Hyper-parameter grid; keys use the <step>__<sub-step>__<parameter> path
# into the pipeline. Each entry lists a single value, so the grid holds
# exactly one candidate.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'regressor__n_neighbors': [10],
}

# 5-fold cross-validated search, two parallel jobs, verbose progress output.
grid_search = GridSearchCV(
    estimator=regr,
    param_grid=param_grid,
    cv=5,
    n_jobs=2,
    verbose=3,
)

In [None]:
# Run the cross-validated search on the training partition and keep the
# object returned by fit() for prediction in the cells below.
model = grid_search.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predict on the held-out partition and score the predictions with R^2
# against the true usd_pledged_real values.
mpreds = model.predict(x_test)
sc = r2_score(y_true=y_test['usd_pledged_real'], y_pred=mpreds)
print(sc)