In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

### Read the data 

In [None]:
# Load the labelled cookie data; the path is relative to the notebook's working directory.
cookies = pd.read_csv('cookies.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
cookies.head()

### Exploratory Data Analysis and Descriptive Statistics

In [None]:
# Column dtypes and non-null counts: shows which columns are non-numeric or have gaps.
cookies.info()

## Bake time 

In [None]:
# Summary statistics (count, mean, spread, quartiles) for bake time.
cookies['bake time'].describe()

In [None]:
# Pairwise scatter plots (plus per-variable distributions) of quality against the two baking settings.
sns.pairplot(cookies[['quality', 'bake time', 'bake temp']])

In [None]:
# Lower-triangle correlation heatmap of the numeric columns.
# The matrix is computed once, on numeric/boolean columns only: pandas >= 2.0 raises on
# text columns (e.g. 'mixins') instead of silently skipping them.
corr = cookies.select_dtypes(include=["number", "bool"]).corr()
# The matrix is symmetric, so hide the diagonal and the upper triangle.
# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')

In [None]:
# Print the correlation of every int64/float64 column with the target, one line per column.
num_cols = cookies.select_dtypes(include=["int64", "float64"]).columns

for column, values in cookies[num_cols].items():
    print(column, '-->', cookies['quality'].corr(values))

## butter

In [None]:
# How many rows fall into each butter type.
cookies['butter type'].value_counts()

## aesthetic appeal

In [None]:
# How many rows take each aesthetic-appeal value.
cookies['aesthetic appeal'].value_counts()


Thoughts on aesthetic appeal:
    the column takes very few distinct values, so it adds little information - drop the column

## weight

In [None]:
# Summary statistics for weight; an implausible minimum or maximum hints at placeholder values.
cookies['weight'].describe()

In [None]:
# List the distinct weights to spot placeholder values.
cookies['weight'].unique()

Thoughts on weight:

    -99 is not a plausible weight and looks like a placeholder for missing data - remove it before modelling

## diameter

In [None]:
# List the distinct diameters to see whether the column varies at all.
cookies['diameter'].unique()

Thoughts on diameter:

    Every value equals 7, so the column is constant and carries no information - it can be dropped

## bake time

In [None]:
# Summary statistics for bake time; `count` below the row total means missing values.
cookies['bake time'].describe()

In [None]:
# Quality scores of the rows whose bake time is missing.
cookies.loc[cookies['bake time'].isna(), 'quality']

Thoughts on bake time:

    Judging by the spread of nulls across the quality bands, missing values could either be replaced by the mean or the affected rows dropped

## crunch factor

In [None]:
# Histogram of crunch factor to inspect its distribution.
cookies['crunch factor'].plot.hist()

## mixins

In [None]:
# Re-check dtypes and non-null counts before looking at the mixins column.
cookies.info()

In [None]:
# Display the raw mixins column.
cookies['mixins']

In [None]:
# Distinct mixins strings, to see which ingredient names occur and how they are separated.
cookies['mixins'].unique()

In [None]:
# Number of distinct non-null mixins strings.
cookies['mixins'].nunique()

### Wrangling the data / making small changes 

In [None]:
# Remove the two columns that the exploratory analysis marked for dropping.
cookies = cookies.drop(columns=["aesthetic appeal", "diameter"])

In [None]:
# Keep only rows that have a mixins value; the encoding loop below calls .split on each one.
cookies=cookies.dropna(subset=['mixins'])

In [None]:
# Renumber the rows 0..n-1 after the drop; the old index labels are discarded.
cookies = cookies.reset_index(drop=True)

In [None]:
# Start every mix-in indicator column at 0; the encoding cell below fills them in.
for mixin_column in ('chocolate', 'raisins', 'oats', 'nuts', 'peanut butter'):
    cookies[mixin_column] = 0

In [None]:
# Confirm the new indicator columns were added.
cookies.head()

In [None]:
# One-hot encode the comma-separated 'mixins' text into a 0/1 column per ingredient.
mixins = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter']

# Split every row once into a set of whitespace-stripped ingredient names. A value that
# is not a list after .str.split (i.e. missing) becomes an empty set instead of raising.
mixin_sets = cookies['mixins'].str.split(',').map(
    lambda parts: {part.strip() for part in parts} if isinstance(parts, list) else set()
)

# Whole-column assignment replaces the row-by-row `.loc` writes: it is much faster and
# does not depend on the index being exactly 0..n-1. Membership is an exact match on the
# full ingredient name, not a substring test.
for mix in mixins:
    cookies[mix] = mixin_sets.map(lambda names: mix in names).astype('int64')

In [None]:
# The raw text column is no longer needed once the indicator columns exist.
cookies = cookies.drop(columns=["mixins"])

In [None]:
# Lower-triangle correlation heatmap, now including the mix-in indicator columns.
# The matrix is computed once, on numeric/boolean columns only: pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them.
corr = cookies.select_dtypes(include=["number", "bool"]).corr()
# The matrix is symmetric, so hide the diagonal and the upper triangle.
# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')

### Preprocessing - drop NaNs and deal with non-numerical data


In [None]:
# Check which columns are still non-numeric and need encoding.
cookies.dtypes

In [None]:
# One-hot encode 'butter type': the column is replaced by one indicator column per category.
cookies=pd.get_dummies(cookies, columns=['butter type'])


In [None]:
# Verify the encoded columns and see which columns still contain missing values.
cookies.info()

In [None]:
# Drop every row that still has a missing value in any column.
# NOTE(review): the -99 placeholder seen in 'weight' during EDA is not NaN, so those rows
# are kept here - confirm whether they should be removed first.
cookies = cookies.dropna()

In [None]:
# Renumber the rows 0..n-1 after the drop; the old index labels are discarded.
cookies = cookies.reset_index(drop=True)

### Split data into dependent and independent variables 

In [None]:
# The target is the 'quality' column; every remaining column is a feature.
y = cookies["quality"]
X = cookies.drop(columns=["quality"])

### Preprocessing - scaling 

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from X, then standardise X with them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### Train test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled features first, then fit the scaler on the training rows only, so that
# test-row statistics never leak into preprocessing. The same random_state and row count
# give the same train/test rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

# `scaler` is re-bound to the train-only fit; later cells reuse it to scale new data.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


### Get the model, fit it, make predictions and evaluate

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Ordinary least-squares regression fitted on the training split.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# `model` is a second name for the same fitted estimator (fit() returns the estimator itself).
model = lm

In [None]:
# Predict quality for the held-out test split.
predictions  = lm.predict(X_test)

In [None]:
# Display the predicted quality values.
predictions

In [None]:
# R^2 of the linear model on the held-out test split.
r2_score(y_test, predictions)

In [None]:
# Mean squared error of the linear model on the held-out test split.
mse = mean_squared_error(y_test, predictions)
print(mse)

In [None]:
# Root mean squared error: the square root of the MSE, in the same units as 'quality'.
import math 
rmse = math.sqrt(mse)
print(rmse)

### Make predictions on the cookies_validate dataset (without labels)

In [None]:
# Load the validation data; the path is relative to the notebook's working directory.
cookies_val = pd.read_csv('cookies_validate.csv')

In [None]:
# Column dtypes and non-null counts, to compare against the training frame.
cookies_val.info()

In [None]:
# Remove the same two columns that were dropped from the training data.
cookies_val = cookies_val.drop(columns=["aesthetic appeal", "diameter"])

In [None]:
# Start every mix-in indicator column at 0; the encoding cell below fills them in.
for mixin_column in ('chocolate', 'raisins', 'oats', 'nuts', 'peanut butter'):
    cookies_val[mixin_column] = 0

In [None]:
# One-hot encode the comma-separated 'mixins' text into a 0/1 column per ingredient,
# then drop the raw text column.
mixins = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter']

# Split every row once into a set of whitespace-stripped ingredient names. A missing value
# is not a list after .str.split, so it becomes an empty set (no mix-ins) instead of raising
# AttributeError; rows with missing mixins are not dropped from this frame beforehand.
mixin_sets = cookies_val['mixins'].str.split(',').map(
    lambda parts: {part.strip() for part in parts} if isinstance(parts, list) else set()
)

# Whole-column assignment replaces the row-by-row `.loc` writes: it is much faster and
# does not depend on the index being exactly 0..n-1. Membership is an exact match on the
# full ingredient name, not a substring test.
for mix in mixins:
    cookies_val[mix] = mixin_sets.map(lambda names: mix in names).astype('int64')

cookies_val.drop(columns=["mixins"], inplace=True)

In [None]:
# The row identifier is not a feature, so remove it before building the feature matrix.
cookies_val = cookies_val.drop(columns=['id'])

In [None]:
# One-hot encode 'butter type' the same way as for the training data.
# NOTE(review): the resulting columns depend on the categories present in this file - confirm
# they match the training columns.
cookies_val=pd.get_dummies(cookies_val, columns=['butter type'])

In [None]:
# Verify the processed validation columns and check for remaining missing values.
cookies_val.info()

In [None]:
# Feature matrix for the validation set. errors="ignore" keeps this working whether or not
# the file carries a 'quality' column (the section heading describes it as unlabelled).
X_val = cookies_val.drop(columns="quality", errors="ignore")

In [None]:
# Apply the scaling that was already fitted. Re-fitting here (fit_transform) would
# standardise the validation data with its own mean and standard deviation, putting it on
# a different scale from the one the model was trained on.
# Reindexing to the training columns keeps the column set and order identical; an indicator
# column for a category that is absent from this file is filled with 0.
X_val_scaled = scaler.transform(X_val.reindex(columns=X.columns, fill_value=0))

In [None]:
# Predict quality for the validation rows with the fitted linear model.
predictions_val=lm.predict(X_val_scaled)

In [None]:
# Display the predicted quality values for the validation rows.
predictions_val

### assess predictions against the test data (with labels)

In [None]:
# Load the labelled file used to score the validation predictions.
cookies_test = pd.read_csv('cookies_test.csv')

In [None]:
# True quality values.
# NOTE(review): assumes the rows are in the same order as cookies_validate.csv - confirm.
y_real = cookies_test["quality"]

In [None]:
# Summary statistics of the true quality values, for context when reading the RMSE.
y_real.describe()

In [None]:
# RMSE of the linear model's validation predictions against the true quality values.
# The square root is taken explicitly because the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_real, predictions_val))

### Can we get more accuracy with an ensemble method?

#### Task 4 : Add a random forest regressor model to try to get a more accurate score. 

You can follow https://www.geeksforgeeks.org/random-forest-regression-in-python/
or the scikit-learn documentation https://scikit-learn.org/stable/search.html?q=random+forest
to import the regressor and fit it to your data before running the cells below, which calculate your new RMSE (an error measure: lower is better).

In [None]:
#Task 4 your code: 


In [None]:
#Task 4 your code: 

In [None]:
#Task 4 your code: 

In [None]:
#Task 4 your code: 

### Evaluate random forest accuracy 

RMSE score - depending on what you named your model and predictions, this code may need to be adapted.

For example, if your model is called `new_model` and its predictions are stored in `predictions_new`, edit the code block below to:
    
    mean_squared_error(y_real, predictions_new, squared = False)

On scikit-learn 1.6 or newer the `squared` argument no longer exists; use `np.sqrt(mean_squared_error(y_real, predictions_new))` instead.
    

In [None]:
#EDIT as appropriate to get your RMSE 
# As written this scores `predictions_val` (the linear model's predictions); replace it with
# your random forest predictions. The square root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.

np.sqrt(mean_squared_error(y_real, predictions_val))