In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [107]:
raw_data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

In [111]:
raw_data.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey,Good,Poor
0,"Lentil, Apple, and Turkey Wrap",Poor,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1
1,Boudin Blanc Terrine with Red Onion Confit,Good,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2,Potato and Fennel Soup Hodge,Poor,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,Mahi-Mahi in Tomato Olive Sauce,Good,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,Spinach Noodle Casserole,Poor,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


In [65]:
raw_data.rating.describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [66]:
raw_data.rating.value_counts()

4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64

Looking at the value counts of the different ratings we see the vast majority lie at 3.750 or greater. This seems like a good cutoff for the binary classifier. Anything less than or equal to 3.750 will be a poor recipe and anything greater than this will be a good recipe. 

In [108]:
# Change the target to a binary outcome
raw_data.rating.replace(to_replace = [4.375, 5.0], value = 'Good', inplace = True)
raw_data.rating.replace(to_replace = [0.0, 3.125, 3.75, 2.5, 1.25, 1.875], value = 'Poor', inplace = True)

In [118]:
# Reexamine value counts
raw_data.rating.value_counts()

Good    10738
Poor     9314
Name: rating, dtype: int64

In [110]:
# Add the dummy variables to the data frame 
raw_data = pd.concat([raw_data, pd.get_dummies(raw_data['rating'])], axis = 1)

In [119]:
# Count null percentages 
raw_data.isnull().sum() * 100/raw_data.count()

title                     0.000000
rating                    0.000000
calories                 25.836210
protein                  26.192574
fat                      26.359569
sodium                   25.852005
#cakeweek                 0.000000
#wasteless                0.000000
22-minute meals           0.000000
3-ingredient recipes      0.000000
30 days of groceries      0.000000
advance prep required     0.000000
alabama                   0.000000
alaska                    0.000000
alcoholic                 0.000000
almond                    0.000000
amaretto                  0.000000
anchovy                   0.000000
anise                     0.000000
anniversary               0.000000
anthony bourdain          0.000000
aperitif                  0.000000
appetizer                 0.000000
apple                     0.000000
apple juice               0.000000
apricot                   0.000000
arizona                   0.000000
artichoke                 0.000000
arugula             

Over 25% of the nutritional information is missing, that is a lot to clean up so for now we will exclude it from the model. If improvements need to be made we can come back to this. 

In [113]:
# Find the 30 best features
sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = raw_data.iloc[:, 6:]
y = raw_data.iloc[:, 1]

bestfeatures = SelectKBest(score_func = chi2, k = 32)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
scores = pd.concat([dfcolumns, dfscores], axis = 1)
scores.columns = ['Specs', 'Score']
print(scores.nlargest(32, 'Score'))

                 Specs         Score
675               Poor  10738.000000
674               Good   9314.000000
277     house & garden    168.736737
186              drink    130.854678
8            alcoholic    114.406372
57         bon appétit    100.806123
235                gin    100.128167
520              roast     87.120913
616       thanksgiving     83.703405
134     cocktail party     76.283693
179             dinner     67.082465
580             spirit     64.501838
124          christmas     56.284523
50             bitters     55.865868
574           soy free     51.798720
453        peanut free     50.641030
251     grill/barbecue     49.678538
343            low fat     46.345607
30        backyard bbq     43.456861
133           cocktail     41.317475
204               fall     36.940820
250              grill     35.906342
593  stuffing/dressing     35.891319
262      harpercollins     35.737446
446              pasta     35.436904
526                rum     34.444240
2

In [117]:
# Select top features for target
target = raw_data[['bon appétit', 'peanut free', 'soy free', 'tree nut free', 'vegan', 'grill', 'grill/barbecue', 'dinner',
                   'house & garden', 'drink', 'alcoholic', 'gin', 'roast', 'thanksgiving', 'cocktail party', 'christmas',
                   'spirit', 'bitters', 'cocktail', 'backyard bbq', 'low fat', 'fall', 'stuffing/dressing', 'pasta', 'rum',     
                   'harpercollins', 'goat cheese', 'stir-fry', 'meat', 'father\'s day']]


### Support Vector Machine Model

In [125]:
from sklearn.svm import SVR
svr = SVR()
X = target.sample(frac=0.3, replace=True, random_state=1)
Y = raw_data.Good.sample(frac=0.3, replace=True, random_state=1)
svr.fit(X,Y)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [127]:
svr.score(X, Y)

-0.1474697961207876

In [122]:
from sklearn.model_selection import cross_val_score
cross_val_score(svr, X, Y, cv=5)



array([-0.17459752, -0.13355313, -0.14252552, -0.2093981 , -0.12044203])