In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from datetime import datetime


In [5]:
pumpkins = pd.read_csv('pumpkin.csv')

pumpkins.head()

pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]

new_columns = ['Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price', 'Date']
pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)

price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2

month = pd.DatetimeIndex(pumpkins['Date']).month
day_of_year = pd.to_datetime(pumpkins['Date']).apply(lambda dt: (dt-datetime(dt.year,1,1)).days)

new_pumpkins = pd.DataFrame(
    {'Month': month, 
     'DayOfYear' : day_of_year, 
     'Variety': pumpkins['Variety'], 
     'City': pumpkins['City Name'], 
     'Package': pumpkins['Package'], 
     'Low Price': pumpkins['Low Price'],
     'High Price': pumpkins['High Price'], 
     'Price': price})

new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/1.1
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price*2

new_pumpkins.head()

Unnamed: 0,Month,DayOfYear,Variety,City,Package,Low Price,High Price,Price
70,9,267,PIE TYPE,BALTIMORE,1 1/9 bushel cartons,15.0,15.0,13.636364
71,9,267,PIE TYPE,BALTIMORE,1 1/9 bushel cartons,18.0,18.0,16.363636
72,10,274,PIE TYPE,BALTIMORE,1 1/9 bushel cartons,18.0,18.0,16.363636
73,10,274,PIE TYPE,BALTIMORE,1 1/9 bushel cartons,17.0,17.0,15.454545
74,10,281,PIE TYPE,BALTIMORE,1 1/9 bushel cartons,15.0,15.0,13.636364


In [26]:
X = pd.get_dummies(new_pumpkins['Variety']) \
    .join(new_pumpkins['Month']) \
    .join(pd.get_dummies(new_pumpkins['City'])) \
    .join(pd.get_dummies(new_pumpkins['Package']))
print(X)

      FAIRYTALE  MINIATURE  MIXED HEIRLOOM VARIETIES  PIE TYPE  Month  \
70            0          0                         0         1      9   
71            0          0                         0         1      9   
72            0          0                         0         1     10   
73            0          0                         0         1     10   
74            0          0                         0         1     10   
...         ...        ...                       ...       ...    ...   
1738          0          1                         0         0      9   
1739          0          1                         0         0      9   
1740          0          1                         0         0      9   
1741          0          1                         0         0      9   
1742          0          1                         0         0      9   

      ATLANTA  BALTIMORE  BOSTON  CHICAGO  COLUMBIA  DETROIT  NEW YORK  \
70          0          1       0        0        

In [27]:
y = new_pumpkins['Price']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)

(332, 20) (332,)


In [29]:
linearModal = LinearRegression()
linearModal.fit(X_train, y_train)

In [30]:
y_pred = linearModal.predict(X_test)

In [39]:
mse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Mean Squared Error: {mse} ({mse/np.mean(y_pred)*100:3.3}%)')


Mean Squared Error: 2.2238953838594293 (8.6%)


# Polynomial regression

In [32]:
pipeline = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
pipeline.fit(X_train, y_train)

poly_pred = pipeline.predict(X_test)

In [38]:
mse = np.sqrt(mean_squared_error(y_test, poly_pred))
print(f'Mean Squared Error: {mse} ({mse/np.mean(y_pred)*100:3.3}%)')
score = pipeline.score(X_train,y_train)
print('Model determination: ', score)

Mean Squared Error: 153109505411.193 (5.92e+11%)
Model determination:  0.9556919856788849
