In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def test_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)
    
    y = df['sellingprice']
    df.drop('sellingprice', axis=1, inplace=True)
    X = df
    
    return X,y

def partial_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
   
    while(True):
        yield df.sample(100).reset_index(drop=True)
        
gen = partial_df()

In [3]:
X, y = test_df()

In [4]:
len(X["body"].unique())

49

In [5]:
X.isna().sum()

year              0
make             95
model            95
trim             95
body            121
transmission    593
vin               0
state             0
condition       110
odometer          0
color            11
interior         11
seller            0
mmr               0
saledate          0
dtype: int64

In [6]:
X.dtypes

year              int64
make             object
model            object
trim             object
body             object
transmission     object
vin              object
state            object
condition        object
odometer        float64
color            object
interior         object
seller           object
mmr              object
saledate         object
dtype: object

1. replace null and 'sedan' value from transmission to 'automatic'
2. convert mmr, condition to numerical (0 for nulls)
3. drop make nulls
4. perform ohe on make, body, transmission, (state)
5. join these with year, condition, mmr, odometer
6. selling price is the y

### Trying to reduce the number of unique columns

In [7]:
X["make"].nunique()

58

In [8]:
X['make'] = X['make'].str.lower()
X["make"].nunique()

46

In [9]:
X["body"].nunique()

48

In [10]:
X['body'] = X['body'].str.lower()
X["body"].nunique()

27

### Dealing with null values

In [11]:
X['transmission'] = X['transmission'].fillna('automatic').replace('sedan', 'automatic')

In [12]:
X['condition'] = pd.to_numeric(X['condition'], errors='coerce')
X['mmr'] = pd.to_numeric(X['mmr'], errors='coerce')

In [13]:
mean_cond = X['condition'].mean()
X['condition'] = X['condition'].fillna(mean_cond)

In [20]:
most_common_make = X['make'].mode()[0]
X['make'] = X['make'].fillna(most_common_make)

most_common_body = X['body'].mode()[0]
X['body'] = X['body'].fillna(most_common_body)

mean_mmr = X['mmr'].mean()
X['mmr'] = X['mmr'].fillna(mean_mmr)

### Filtering into a new df

In [37]:
df = X.loc[:, ['make', 'body', 'transmission', 'condition', 'odometer', 'mmr']]

In [38]:
df.isna().sum()

make            0
body            0
transmission    0
condition       0
odometer        0
mmr             0
dtype: int64

### Performing one hot encoding, and standard scaling on data

In [39]:
df = pd.get_dummies(df, columns=['make', 'body', 'transmission'])

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['condition', 'odometer', 'mmr']] = scaler.fit_transform(df[['condition', 'odometer', 'mmr']])

In [44]:
df.shape

(5000, 79)

### Fitting the model

In [45]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(alpha = 0.001)
batch_size = 10
for i in range(0, len(df), batch_size):
    X_batch = df[i:i+batch_size]
    y_batch = y[i:i+batch_size]
    sgd.partial_fit(X_batch, y_batch)

In [46]:
acc = sgd.score(df, y)
print(acc)

0.9679654854828814


### Saving as pickle file

In [48]:
import pickle

with open('sgd_model.pkl', 'wb') as file:
    pickle.dump(sgd, file)