## DASK

In [1]:
import dask 
import dask.dataframe as dd
import dask.array as da

In [2]:
# Read the training CSV into a Dask DataFrame; later cells call .compute() to materialize results.
df = dd.read_csv('BF_train.csv')

In [3]:
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


## Data Exploration

In [4]:
# Fraction of missing values per column. The mean of the boolean null mask is
# that fraction directly, so a single computation replaces the separate
# sum().compute() and len(df) passes over the data.
df.isnull().mean().compute()

User_ID                       0.000000
Product_ID                    0.000000
Gender                        0.000000
Age                           0.000000
Occupation                    0.000000
City_Category                 0.000000
Stay_In_Current_City_Years    0.000000
Marital_Status                0.000000
Product_Category_1            0.000000
Product_Category_2            0.315666
Product_Category_3            0.696727
Purchase                      0.000000
dtype: float64

In [5]:
# Summary statistics; the rendered table covers only the numeric columns.
df.describe().compute()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [6]:
# Inspect column dtypes to see which columns are non-numeric (object).
df.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [7]:
# Row count and column count of the loaded dataset.
len(df), len(df.columns)

(550068, 12)

In [8]:
# Remove the Product_ID column before encoding.
# NOTE(review): presumably dropped because it is a high-cardinality identifier — confirm.
# This cell is not idempotent: running it a second time raises because the column is already gone.
df = df.drop(['Product_ID'], axis=1)

## Data Preprocessing

In [9]:
# One-hot encode the non-numeric columns: categorize() converts the object columns to
# categoricals so that get_dummies() can expand them into indicator columns.
# .compute() materializes the result, so from this cell on `df` holds the computed
# frame rather than a lazy Dask DataFrame (later cells use it without .compute()).
df = dd.get_dummies(df.categorize()).compute()

In [10]:
# Check the encoded frame: the object columns now appear as 0/1 indicator columns.
df.head()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_F,Gender_M,Age_0-17,...,Age_36-45,Age_18-25,City_Category_A,City_Category_C,City_Category_B,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_4+,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_0
0,1000001,10,0,3,,,8370,1,0,1,...,0,0,1,0,0,1,0,0,0,0
1,1000001,10,0,1,6.0,14.0,15200,1,0,1,...,0,0,1,0,0,1,0,0,0,0
2,1000001,10,0,12,,,1422,1,0,1,...,0,0,1,0,0,1,0,0,0,0
3,1000001,10,0,12,14.0,,1057,1,0,1,...,0,0,1,0,0,1,0,0,0,0
4,1000002,16,0,8,,,7969,0,1,0,...,0,0,0,1,0,0,1,0,0,0


In [11]:
# Replace missing values with 0. Per the null-fraction output earlier in the notebook,
# only Product_Category_2 and Product_Category_3 contain missing values.
df = df.fillna(value=0)

In [12]:
# Columns used as model inputs, in the order they are fed to the estimator.
# User_ID and the Purchase target are not included.
feature_columns = [
    # numeric attributes
    'Occupation',
    'Marital_Status',
    'Product_Category_1',
    'Gender_F',
    'Product_Category_2',
    'Product_Category_3',
    'Gender_M',
    # age-bracket indicators
    'Age_0-17',
    'Age_55+',
    'Age_26-35',
    'Age_46-50',
    'Age_51-55',
    'Age_36-45',
    'Age_18-25',
    # city-category indicators
    'City_Category_A',
    'City_Category_C',
    'City_Category_B',
    # years-in-current-city indicators
    'Stay_In_Current_City_Years_2',
    'Stay_In_Current_City_Years_4+',
    'Stay_In_Current_City_Years_3',
    'Stay_In_Current_City_Years_1',
    'Stay_In_Current_City_Years_0',
]
x = df[feature_columns]

# Regression target.
y = df['Purchase']

In [14]:
from dask_ml.model_selection import train_test_split

In [15]:
# Hold out 10% of the rows for testing. The ratio is passed explicitly instead of
# relying on the library default; 10% is what the recorded 495061/55007 split
# corresponds to. random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)

In [16]:
# Sanity check: feature and target partitions must have matching lengths.
len(X_train), len(X_test), len(y_train), len(y_test)

(495061, 55007, 495061, 55007)

### Dask-ML Linear Regression

In [17]:
# Pull the underlying arrays out of the training partitions for the estimator.
training_x, training_y = X_train.values, y_train.values

In [18]:
# Pull the underlying arrays out of the held-out partitions for evaluation.
testing_x, testing_y = X_test.values, y_test.values

In [19]:
import numpy as np
from sklearn.metrics import mean_squared_error

In [20]:
def rmse(preds, actuals):
    """Print the root mean squared error between predictions and actual values.

    Parameters
    ----------
    preds : predicted target values.
    actuals : true target values, aligned element-wise with ``preds``.

    Nothing is returned; the score is written to standard output.
    """
    # mean_squared_error takes the ground truth first, then the predictions.
    print(np.sqrt(mean_squared_error(actuals, preds)))

In [21]:
from dask_ml.linear_model import LinearRegression
# Fit the Dask-ML linear model on the training arrays. Per the estimator repr in this
# cell's output, the defaults in effect are solver='admm', penalty='l2' and C=1.0.
lr = LinearRegression(random_state=1, n_jobs=-1)
lr.fit(training_x,training_y)

LinearRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                 intercept_scaling=1.0, max_iter=100, multi_class='ovr',
                 n_jobs=-1, penalty='l2', random_state=1, solver='admm',
                 solver_kwargs=None, tol=0.0001, verbose=0, warm_start=False)

In [22]:
# Training-set error: predict on the data the model was fitted on; rmse() prints the score.
preds= lr.predict(training_x)
rmse(preds, training_y)

4624.266265525379


In [23]:
# Held-out error: predict on the test partition for comparison with the training score.
preds_test= lr.predict(testing_x)
rmse(preds_test, testing_y)

4624.036261865467
