In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
%matplotlib inline

In [3]:
# Load the recipe table from the CSV that sits next to the notebook.
raw_data = pd.read_csv(filepath_or_buffer='svmtest.csv')
# Preview the first five rows to sanity-check the columns.
raw_data.head(n=5)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.svm import SVR

# Baseline: support-vector regression of the raw rating, default hyper-parameters.
svr = SVR()
# Predictors are every column except the target, the free-text title and the
# four nutrition columns. `axis` is passed by keyword because pandas >= 2.0 no
# longer accepts it positionally (`drop(labels, 1)` raises TypeError there).
X = raw_data.drop(['rating', 'title', 'calories', 'protein', 'fat', 'sodium'], axis=1)
# Target: the continuous recipe rating.
Y = raw_data.rating
svr.fit(X, Y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [5]:
# Score the fitted regressor on the same rows it was trained on
# (for a regressor, `score` reports R^2).
svr.score(X=X, y=Y)

-0.018605587418357272

In [6]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated score of the SVR on the same predictors and target.
cross_val_score(estimator=svr, X=X, y=Y, cv=5)

array([-0.05425375, -0.00845367, -0.0435314 , -0.07015603, -0.03768809])

In [7]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Seed the forest so the CV scores below, and the feature importances read from
# this estimator later in the notebook, are reproducible on Restart & Run All.
rfc = ensemble.RandomForestClassifier(random_state=42)
# Keep only rows with no missing values in any column.
X2 = raw_data.dropna()

# Keep the nutrition columns (calories, fat, sodium, protein) this time; drop
# only the target and the free-text title. `axis` is passed by keyword because
# pandas >= 2.0 rejects the positional form `drop(labels, 1)`.
X3 = X2.drop(['rating', 'title'], axis=1)

# A classifier needs a discrete target, so per the challenge requirement the
# continuous rating is binarised: 1 ("good") when the rating is at or above
# 3.75, otherwise 0 ("bad"). Vectorised comparison instead of a row-wise lambda.
Y = (X2['rating'] >= 3.75).astype('int64')
# One-hot encode any non-numeric predictor columns; numeric columns pass
# through unchanged.
X = pd.get_dummies(X3)

# 5-fold cross-validated accuracy of the random forest.
cross_val_score(rfc, X, Y, cv=5)

array([0.79931389, 0.78865979, 0.78178694, 0.79896907, 0.8141136 ])

In [12]:
# Fit the forest on all rows, then pair each predictor column with its
# importance score in a two-column table.
rfc.fit(X, Y)
df = pd.DataFrame({'imp': rfc.feature_importances_, 'features': X.columns})

In [14]:
# Rank the features from most to least important.
df2 = df.sort_values('imp', ascending=False)

In [15]:
# Show the five highest-ranked features.
df2.head(n=5)

Unnamed: 0,imp,features
0,0.060748,calories
3,0.053916,sodium
2,0.050063,fat
1,0.042957,protein
61,0.014515,bon appétit


In [16]:
# Keep the names of the 30 top-ranked features (df2 is already sorted by importance).
important_features = df2['features'].head(30)

In [23]:
# Display the selected feature names.
# NOTE(review): this cell's execution count (In [23]) is out of sequence and its
# recorded output is a plain Python list, which suggests it was last run after
# the list conversion in the following cell; on a fresh top-to-bottom run this
# would presumably render a pandas Series instead - confirm with Restart & Run All.
important_features

['calories',
 'sodium',
 'fat',
 'protein',
 'bon appétit',
 'gourmet',
 'quick & easy',
 'fall',
 'wheat/gluten-free',
 'summer',
 'vegetarian',
 'non-alcoholic',
 'bake',
 'winter',
 'condiment/spread',
 'vegetable',
 'spring',
 'kosher',
 'kid-friendly',
 'onion',
 'peanut free',
 'cocktail party',
 'leafy green',
 'herb',
 'alcoholic',
 'salad',
 'pescatarian',
 'tomato',
 'drink',
 'house & garden']

In [17]:
# Convert the selected feature names to a plain list. `list(...)` is used
# instead of `.tolist()` so the cell is idempotent: re-running it when
# `important_features` is already a list no longer raises AttributeError.
important_features = list(important_features)

In [18]:
# Rebuild the complete-case frame (rows with no missing values).
X2 = raw_data.dropna()

# Keep the nutrition columns (calories, fat, sodium, protein); drop only the
# target and the free-text title. `axis` is passed by keyword because
# pandas >= 2.0 rejects the positional form `drop(labels, 1)`.
X3 = X2.drop(['rating', 'title'], axis=1)

In [19]:
#Create new dataframe with just the features we identified from random forest

# One label-based selection replaces the column-by-column loop: same columns,
# same order, same row index. `.copy()` keeps df_new independent of X3.
df_new = X3[list(important_features)].copy()

In [20]:
#Import SVC to classify rather than regress
from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters.
svc = SVC()
# Predictors: the features selected via the random-forest importances.
# NOTE(review): the selection includes raw nutrition values (calories, sodium,
# ...) whose magnitudes are far larger than the other columns shown in the
# preview; consider standardising before an RBF-kernel SVC - TODO confirm.
X = df_new
# Binary target: 1 when the rating is at or above 3.75, otherwise 0.
Y = (X2['rating'] >= 3.75).astype('int64')
svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Re-derive predictors and binary target so this cell can be run on its own.
X = df_new
Y = (X2['rating'] >= 3.75).astype('int64')
# Accuracy on the same rows the classifier was fitted on (training accuracy).
svc.score(X=X, y=Y)

0.9742268041237113

In [22]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the SVC on the selected features.
# NOTE(review): the features were chosen from a forest fitted on all rows, so
# these folds are presumably not fully independent of the selection step - verify.
cross_val_score(estimator=svc, X=X, y=Y, cv=5)

array([0.81132075, 0.81786942, 0.80756014, 0.80756014, 0.80895009])