# Working a Text-Based Data Science Problem

## Agenda

1. Reading in and exploring the data
2. Feature engineering
3. Model evaluation using train_test_split and cross_val_score
4. Making predictions for new data
5. Searching for optimal tuning parameters using GridSearchCV
6. Extracting features from text using CountVectorizer
7. Proper cross-validation using Pipeline
8. Combining GridSearchCV with Pipeline
9. Efficiently searching for tuning parameters using RandomizedSearchCV
10. Ensembling
11. Locating groups of similar cuisines
12. Model stacking

In [2]:
# use print only as a function
from __future__ import print_function

## Part 1: Reading in and exploring the data

In [3]:
import pandas as pd
train = pd.read_json('data/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [None]:
train.shape

In [None]:
# count the number of null values in each column
train.isnull().sum()

In [None]:
train.dtypes

In [None]:
# select row 0, column 'ingredients'
train.loc[0, 'ingredients']

In [None]:
# contents are stored as a list of strings, not as a string
type(train.loc[0, 'ingredients'])

In [None]:
# examine the class distribution
train.cuisine.value_counts()

## Part 2: Feature engineering

In [None]:
# count the number of ingredients in each recipe
train['num_ingredients'] = train.ingredients.apply(len)
train.head()

In [None]:
# for each cuisine, calculate the mean number of ingredients
train.groupby('cuisine').num_ingredients.mean()

In [None]:
# for each cuisine, "describe" the number of ingredients (and unstack into a DataFrame)
#train.groupby('cuisine').num_ingredients.describe()
train.groupby('cuisine').num_ingredients.describe().unstack()

In [None]:
# allow plots to appear in the notebook
%matplotlib inline

In [None]:
# box plot of the number of ingredients for each cuisine
train.boxplot('num_ingredients', by='cuisine')

In [None]:
import numpy as np

In [None]:
# calculate the mean ingredient length for each recipe
train['ingredient_length'] = train.ingredients.apply(lambda x: np.mean([len(item) for item in x]))
train.head()

In [None]:
# box plot of ingredient length for each cuisine
train.boxplot('ingredient_length', by='cuisine')

In [None]:
def make_features(df):
    """Add engineered recipe features to ``df`` in place and return it.

    ``df`` must have an ``ingredients`` column whose entries are lists of
    ingredient names. Two columns are added:

    * ``num_ingredients``: how many ingredients each recipe lists
    * ``ingredient_length``: mean character length of a recipe's ingredient names
    """
    def mean_name_length(names):
        return np.mean([len(name) for name in names])

    recipes = df.ingredients
    df['num_ingredients'] = recipes.apply(len)
    df['ingredient_length'] = recipes.apply(mean_name_length)
    return df

In [None]:
# check that the function works
train = make_features(pd.read_json('data/train.json'))
train.head()

## Part 3: Model evaluation using train_test_split and cross_val_score

In [None]:
# define X and y
feature_cols = ['num_ingredients', 'ingredient_length']
X = train[feature_cols]
y = train.cuisine

In [None]:
print(X.shape)
print(y.shape)

In [None]:
# use KNN with K=100
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=100)

In [None]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# make class predictions for the testing set
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)

In [None]:
# check the classification accuracy of KNN's predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# use 5-fold cross-validation instead of train/test split
from sklearn.cross_validation import cross_val_score
cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()

In [None]:
# calculate the null accuracy (the accuracy achieved by always predicting the most frequent cuisine)
y.value_counts().head(1) / len(y)

## Part 4: Making predictions for new data

In [None]:
# read in test.json and add the additional features
new = make_features(pd.read_json('data/test.json'))
new.head()

In [None]:
# train KNN on all of the data
knn.fit(X, y)

In [None]:
# create a DataFrame of the relevant columns from the new data
X_new = new[feature_cols]
X_new.head()

In [None]:
# make class predictions for the new data
new_pred_class_knn = knn.predict(X_new)
new_pred_class_knn

In [None]:
# calculate predicted probabilities for the new data (for use with ensembling)
new_pred_prob_knn = knn.predict_proba(X_new)
new_pred_prob_knn.shape

In [None]:
# create a DataFrame that only contains the IDs and predicted classes for the new data
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_knn}).set_index('id').head()

In [None]:
# create a submission file from that DataFrame (score: 0.21742)
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_knn}).set_index('id').to_csv('sub1.csv')

## Part 5: Searching for optimal tuning parameters using GridSearchCV

In [None]:
# recalculate the cross-validated accuracy of KNN with K=100
knn = KNeighborsClassifier(n_neighbors=100)
cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()

In [None]:
from sklearn.grid_search import GridSearchCV

In [None]:
# the "parameter grid" maps each tuning parameter name to the list of options to try
param_grid = {'n_neighbors': [100, 200, 300]}
param_grid

In [None]:
# instantiate the grid
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

In [None]:
# run the grid search
grid.fit(X, y)

In [None]:
# examine the scores for each parameter option
grid.grid_scores_

In [None]:
# widen the search: K from 100 through 1000 in steps of 100
param_grid = {'n_neighbors': range(100, 1001, 100)}
param_grid

In [None]:
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

In [None]:
# time the grid search using an IPython "magic function"
%time grid.fit(X, y)

In [None]:
# examine the scores for each parameter option
grid.grid_scores_

In [None]:
# extract only the mean scores
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
grid_mean_scores

In [None]:
# line plot of K value (x-axis) versus accuracy (y-axis)
import matplotlib.pyplot as plt
plt.plot(range(100, 1001, 100), grid_mean_scores)

In [None]:
# print the single best score and parameters that produced that score
print(grid.best_score_)
print(grid.best_params_)

## Part 6: Extracting features from text using CountVectorizer

In [None]:
# reminder: contents are stored as a list of strings, not as a string
train.loc[10, 'ingredients']

In [None]:
def make_features(df):
    """Add engineered recipe features to ``df`` in place and return it.

    ``df`` must have an ``ingredients`` column whose entries are lists of
    ingredient names. Three columns are added:

    * ``num_ingredients``: how many ingredients each recipe lists
    * ``ingredient_length``: mean character length of a recipe's ingredient names
    * ``ingredients_str``: the ingredient list converted to a single string,
      so it can be fed to a text vectorizer
    """
    def mean_name_length(names):
        return np.mean([len(name) for name in names])

    recipes = df.ingredients
    df['num_ingredients'] = recipes.apply(len)
    df['ingredient_length'] = recipes.apply(mean_name_length)
    df['ingredients_str'] = recipes.astype(str)
    return df

In [None]:
# run make_features and check that it worked
train = make_features(train)
train.loc[0, 'ingredients_str']

In [None]:
# define X and y
X = train.ingredients_str
y = train.cuisine

In [None]:
# import and instantiate CountVectorizer (with default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect

In [None]:
# create a document-term matrix from all of the training data
X_dtm = vect.fit_transform(X)
X_dtm.shape

In [None]:
# examine the features that were created
print(vect.get_feature_names()[0:100])

In [None]:
# replace the regex pattern that is used for tokenization
vect = CountVectorizer(token_pattern=r"'([a-z ]+)'")
X_dtm = vect.fit_transform(X)
X_dtm.shape

In [None]:
# examine the features that were created
print(vect.get_feature_names()[0:100])

## Part 7: Proper cross-validation using Pipeline

In [None]:
# import and instantiate Multinomial Naive Bayes (with the default parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
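# slightly improper cross-validation: X_dtm was built by fitting the vectorizer on all rows,
# so each fold's vocabulary was learned partly from that fold's own testing rows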
cross_val_score(nb, X_dtm, y, cv=5, scoring='accuracy').mean()

In [None]:
# create a "pipeline" of CountVectorizer and MultinomialNB
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

In [None]:
# examine the pipeline steps
pipe.steps

In [None]:
# proper cross-validation with pipeline
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

## Part 8: Combining GridSearchCV with Pipeline

In [None]:
# parameters to search; each key names the pipeline step and then the parameter,
# joined by a double underscore
param_grid = {
    'countvectorizer__token_pattern': [r"\b\w\w+\b", r"'([a-z ]+)'"],
    'multinomialnb__alpha': [0, 0.5, 1],
}
param_grid
pipe.steps

In [None]:
# pass the pipeline (instead of just the model) to GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
%time grid.fit(X, y)

In [None]:
# examine the scores for each combination of parameters
grid.grid_scores_

In [None]:
print(grid.best_score_)
print(grid.best_params_)

## Part 9: Efficiently searching for tuning parameters using RandomizedSearchCV

In [None]:
from sklearn.grid_search import RandomizedSearchCV

In [None]:
# for any continuous parameters, specify a distribution instead of a list of options
import scipy as sp
# "import scipy" alone is not guaranteed to load the stats submodule, so
# import it explicitly rather than relying on another library having done so
import scipy.stats
param_grid = {}
param_grid['countvectorizer__token_pattern'] = [r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# uniform(scale=1) draws alpha from the interval [0, 1]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
param_grid

In [None]:
# set a random seed for sp.stats.uniform
np.random.seed(1)

In [None]:
# run 10 random searches
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=10, random_state=1)
%time rand.fit(X, y)

In [None]:
rand.grid_scores_

In [None]:
print(rand.best_score_)
print(rand.best_params_)

In [None]:
# run make_features on the new data
new = make_features(new)

In [None]:
# define X_new as the ingredient text
X_new = new.ingredients_str

In [None]:
# RandomizedSearchCV (and GridSearchCV) are automatically fit with the best parameters, and can be used to make predictions
new_pred_class_rand = rand.predict(X_new)
new_pred_class_rand

In [None]:
# also calculate predicted probabilities for the new data (for use with ensembling)
new_pred_prob_rand = rand.predict_proba(X_new)
new_pred_prob_rand.shape

In [None]:
# create a submission file (score: 0.75422)
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_rand}).set_index('id').to_csv('sub2.csv')

## Part 10: Ensembling

In [None]:
# calculate the mean of the predicted probabilities from KNN and RandomizedSearchCV
new_pred_prob_combined = (new_pred_prob_knn + new_pred_prob_rand)/2
new_pred_prob_combined.shape

In [None]:
# create a list of the cuisines in alphabetical order
cuisines = np.sort(train.cuisine.unique())
cuisines

In [None]:
# convert the predicted probabilities into a DataFrame
new_pred_prob_combined = pd.DataFrame(new_pred_prob_combined, columns=cuisines)
new_pred_prob_combined.head()

In [None]:
# for each row, find the column (cuisine name) with the highest predicted probability
# (idxmax returns the column label; np.argmax applied to a row returns an integer
# position on current pandas versions, which would put numbers in the submission)
new_pred_class_combined = new_pred_prob_combined.idxmax(axis=1)
new_pred_class_combined.head()

In [None]:
# create a submission file (score: 0.75483)
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_combined}).set_index('id').to_csv('sub3.csv')

## Part 11: Locating groups of similar cuisines

Adapted from this [Stack Overflow question](http://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity/12128777#12128777)

In [None]:
# create a document-term matrix from X using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
X_dtm = vect.fit_transform(X)
X_dtm.shape

In [None]:
train.head(1)

In [None]:
# calculate the cosine similarity between the first recipe and all recipes
cosine_similarities = metrics.pairwise.linear_kernel(X_dtm[0, :], X_dtm).flatten()
cosine_similarities

In [None]:
# pair every recipe's cuisine with its similarity to the first recipe,
# then show only the rows whose cuisine is 'greek'
df = pd.DataFrame({'cuisine':train.cuisine, 'similarity':cosine_similarities})
df[df['cuisine']=='greek']

In [None]:
# calculate the recipe's mean similarity to each cuisine
pd.DataFrame({'cuisine':train.cuisine, 'similarity':cosine_similarities}).groupby('cuisine').similarity.mean()

In [None]:
train.loc[1, 'cuisine']

In [None]:
# for each cuisine, count the number of recipes
cuisine_count = {}

# for each cuisine, sum the mean similarity to each cuisine
cuisine_total_similarities = {}

# loop through the first 2000 recipes (or all of them, if there are fewer);
# a sample this size is needed so that the cuisines are represented in the
# per-cuisine similarity table built from these totals
for i in range(min(2000, len(train))):

    # save the name of this recipe's cuisine
    current_cuisine = train.loc[i, 'cuisine']

    # calculate the similarity between this recipe and all recipes
    recipe_similarities = metrics.pairwise.linear_kernel(X_dtm[i, :], X_dtm).flatten()

    # calculate the recipe's mean similarity to each cuisine
    cuisine_similarities = pd.DataFrame({'cuisine':train.cuisine, 'similarity':recipe_similarities}).groupby('cuisine').similarity.mean()

    # update the cuisine count and add the mean similarities
    if current_cuisine not in cuisine_count:
        cuisine_count[current_cuisine] = 1
        cuisine_total_similarities[current_cuisine] = cuisine_similarities
    else:
        cuisine_count[current_cuisine] += 1
        cuisine_total_similarities[current_cuisine] += cuisine_similarities

In [None]:
cuisine_count

In [None]:
cuisine_total_similarities = pd.DataFrame(cuisine_total_similarities)
cuisine_total_similarities

In [None]:
# create a copy of the DataFrame
cuisine_mean_similarities = cuisine_total_similarities.copy()

In [None]:
# normalize each column by the recipe count for each cuisine
for col in cuisine_mean_similarities.columns:
    cuisine_mean_similarities[col] /= cuisine_count[col]
cuisine_mean_similarities

In [None]:
# display the mean similarities as a heatmap
import seaborn as sns
sns.heatmap(cuisine_mean_similarities)

In [None]:
# display the correlation matrix as a heatmap
sns.heatmap(cuisine_mean_similarities.corr())

**Hand-selected cuisine groups:**

1. chinese, filipino, japanese, korean, thai, vietnamese
2. british, french, irish, russian, southern_us
3. greek, italian, moroccan, spanish
4. brazilian, cajun_creole, indian, jamaican, mexican

## Part 12: Model stacking

In [None]:
# map each cuisine to its group number, one hand-selected group at a time
cuisine_to_group = dict.fromkeys(
    ['chinese', 'filipino', 'japanese', 'korean', 'thai', 'vietnamese'], 1)
cuisine_to_group.update(dict.fromkeys(
    ['british', 'french', 'irish', 'russian', 'southern_us'], 2))
cuisine_to_group.update(dict.fromkeys(
    ['greek', 'italian', 'moroccan', 'spanish'], 3))
cuisine_to_group.update(dict.fromkeys(
    ['brazilian', 'cajun_creole', 'indian', 'jamaican', 'mexican'], 4))

In [None]:
# map the cuisines to their group numbers
train['group'] = train.cuisine.map(cuisine_to_group)
train.head()

In [None]:
# check that all recipes were assigned a group
train.group.isnull().sum()

In [None]:
# "global" problem: ingredient text as X, group number as y
X, y = train.ingredients_str, train.group

# per-group problems: only the rows in that group, ingredient text as X, cuisine as y
X1, y1 = train.loc[train.group==1, 'ingredients_str'], train.loc[train.group==1, 'cuisine']
X2, y2 = train.loc[train.group==2, 'ingredients_str'], train.loc[train.group==2, 'cuisine']
X3, y3 = train.loc[train.group==3, 'ingredients_str'], train.loc[train.group==3, 'cuisine']
X4, y4 = train.loc[train.group==4, 'ingredients_str'], train.loc[train.group==4, 'cuisine']

In [None]:
# one "global" pipeline plus one pipeline per group; all five are separate
# objects built the same way (CountVectorizer followed by MultinomialNB)
pipe_main, pipe_1, pipe_2, pipe_3, pipe_4 = (
    make_pipeline(CountVectorizer(), MultinomialNB()) for _ in range(5)
)

In [None]:
# calculate the cross-validated accuracy for each pipeline
print(cross_val_score(pipe_main, X, y, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_1, X1, y1, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_2, X2, y2, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_3, X3, y3, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_4, X4, y4, cv=5, scoring='accuracy').mean())

In [None]:
# fit each pipeline with the relevant X and y
pipe_main.fit(X, y)
pipe_1.fit(X1, y1)
pipe_2.fit(X2, y2)
pipe_3.fit(X3, y3)
pipe_4.fit(X4, y4)

In [None]:
# for new data, first make group predictions
X_new = new.ingredients_str
new_pred_group = pipe_main.predict(X_new)
new_pred_group

In [None]:
# then within each predicted group, make class predictions
new_pred_class_1 = pipe_1.predict(X_new[new_pred_group==1])
new_pred_class_2 = pipe_2.predict(X_new[new_pred_group==2])
new_pred_class_3 = pipe_3.predict(X_new[new_pred_group==3])
new_pred_class_4 = pipe_4.predict(X_new[new_pred_group==4])
print(new_pred_class_1)
print(new_pred_class_2)
print(new_pred_class_3)
print(new_pred_class_4)

In [None]:
# add the class predictions to the DataFrame of new data
new.loc[new_pred_group==1, 'pred_class'] = new_pred_class_1
new.loc[new_pred_group==2, 'pred_class'] = new_pred_class_2
new.loc[new_pred_group==3, 'pred_class'] = new_pred_class_3
new.loc[new_pred_group==4, 'pred_class'] = new_pred_class_4

In [None]:
new.head()

In [None]:
# create a submission file (score: 0.70475)
pd.DataFrame({'id':new.id, 'cuisine':new.pred_class}).set_index('id').to_csv('sub4.csv')