### Convert the Yelp Dataset from json format to csv.

In [None]:
import collections
import csv
import argparse
import simplejson as json


def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a line-delimited JSON dataset file into a CSV file.

    The first CSV row is the header built from `column_names`; each following
    row holds the values of one JSON line, produced by `get_row` for the same
    column order.

    Args:
        json_file_path: Path of the input file, one JSON object per line.
        csv_file_path: Path of the CSV file to create (truncated if present).
        column_names: Iterable of flattened, dot-separated column names.
    """
    # Freeze the column order once so the header and every row agree, even
    # when `column_names` is a one-shot iterable.
    columns = list(column_names)

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3 (writing str to a 'wb+' file raises there).
    if str is bytes:
        mode, open_kwargs = 'wb+', {}
    else:
        mode, open_kwargs = 'w+', {'newline': '', 'encoding': 'utf-8'}

    with open(csv_file_path, mode, **open_kwargs) as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path) as fin:
            for line in fin:
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, columns))

def get_superset_of_column_names_from_file(json_file_path):
    """Return the set of every flattened column name found in the file.

    Each line of `json_file_path` is parsed as one JSON object and flattened
    with `get_column_names`; the union of all resulting keys is returned.
    """
    superset = set()
    with open(json_file_path) as fin:
        for raw_line in fin:
            flattened = get_column_names(json.loads(raw_line))
            # Updating a set from a dict adds the dict's keys.
            superset.update(flattened)
    return superset

def get_column_names(line_contents, parent_key=''):
    """Flatten a nested dict into a single-level dict with dotted keys.

    Args:
        line_contents: Mapping to flatten; values that are themselves mutable
            mappings are flattened recursively.
        parent_key: Prefix prepended (with a '.') to every key; empty at the
            top level.

    Returns:
        dict mapping each flattened key (e.g. 'attributes.Ambience.divey') to
        its leaf value.
    """
    # `collections.MutableMapping` was removed in Python 3.10; fall back to
    # the old location only on Python 2.
    try:
        from collections.abc import MutableMapping
    except ImportError:
        from collections import MutableMapping

    flattened = {}
    # `.items()` works on both Python 2 and 3, unlike `.iteritems()`.
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

def get_nested_value(d, key):
    """Look up a flattened key from `get_column_names` in a nested dict.

    Args:
        d: The (possibly nested) mapping to search.
        key: Dot-separated path such as 'attributes.Ambience.divey'.

    Returns:
        The value stored at that path, or None when any path component is
        missing or an intermediate value is not a mapping (e.g. a null
        'attributes' entry, or a string where other records hold a dict).
    """
    try:
        from collections.abc import Mapping
    except ImportError:  # Python 2
        from collections import Mapping

    # A non-mapping cannot contain the remaining path; without this guard a
    # None value raises TypeError and a string is wrongly substring-matched.
    if not isinstance(d, Mapping):
        return None

    base_key, dot, sub_key = key.partition('.')
    if base_key not in d:
        return None
    if not dot:
        return d[base_key]
    return get_nested_value(d[base_key], sub_key)

def get_row(line_contents, column_names):
    """Build one CSV row from a parsed JSON record.

    Args:
        line_contents: Nested dict for one record.
        column_names: Iterable of flattened column names; the row follows
            this order.

    Returns:
        list of strings, one per column; missing (None) values become ''.
    """
    # `unicode` does not exist on Python 3, so derive the text type instead.
    text_type = type(u'')
    # Only Python 2's csv writer needs text encoded to UTF-8 bytes.
    needs_encoding = str is bytes

    row = []
    for column_name in column_names:
        line_value = get_nested_value(
                        line_contents,
                        column_name,
                        )
        if line_value is None:
            row.append('')
        elif needs_encoding and isinstance(line_value, text_type):
            row.append(line_value.encode('utf-8'))
        else:
            row.append('{0}'.format(line_value))
    return row

if __name__ == '__main__':
    # Convert a yelp dataset file from json to csv.
    # Usage: <script> path/to/file.json  ->  writes path/to/file.csv

    parser = argparse.ArgumentParser(
            description='Convert Yelp Dataset from JSON format to CSV.',
            )

    parser.add_argument(
            'json_file',
            type=str,
            help='The json file to convert.',
            )

    args = parser.parse_args()

    json_file = args.json_file
    # Output path: everything before the first '.json' plus '.csv'.
    csv_file = '{0}.csv'.format(json_file.split('.json')[0])

    # First pass collects every column name, second pass writes the rows.
    column_names = get_superset_of_column_names_from_file(json_file)
    # Kept inside the guard: at module level this ran on import and failed
    # because `column_names` was undefined.
    read_and_write_file(json_file, csv_file, column_names)

## Data Reduction: 

### Extract the data pertaining to "Restaurants"
The Yelp dataset contains data for many business categories, such as universities, hospitals, malls, restaurants and automobiles. Since we are analysing the "Restaurants" data, the rows whose categories include "Restaurants" or "Food" are extracted and stored in a separate CSV file.

In [None]:
import re
import csv



# Column order of business.csv; the files carry no usable header for the
# readers below, so the names are supplied explicitly.
fieldnames = [
    "attributes.Ambience.divey", "attributes.RestaurantsDelivery", "attributes.DogsAllowed",
    "postal_code", "hours.Thursday", "attributes.HairSpecializesIn.coloring",
    "attributes.BestNights.sunday", "attributes.BYOB", "attributes.AgesAllowed",
    "attributes.Music.video", "hours.Friday", "latitude", "attributes.Alcohol",
    "attributes.Ambience.classy", "attributes.RestaurantsTableService", "business_id",
    "attributes.Ambience.touristy", "attributes.RestaurantsCounterService", "attributes.Corkage",
    "attributes.RestaurantsGoodForGroups", "categories", "name",
    "attributes.BusinessAcceptsBitcoin", "attributes.HappyHour", "attributes.WheelchairAccessible",
    "attributes.Ambience.hipster", "attributes.BusinessAcceptsCreditCards", "is_open",
    "attributes.DietaryRestrictions.vegetarian", "attributes.Music.live",
    "attributes.Music.background_music", "neighborhood", "attributes.BusinessParking.lot",
    "attributes.Music.karaoke", "review_count", "attributes.GoodForMeal.breakfast",
    "attributes.NoiseLevel", "attributes.HairSpecializesIn.perms", "state",
    "attributes.DriveThru", "attributes.HasTV", "attributes.GoodForMeal.dinner",
    "attributes.BusinessParking.street", "address", "attributes.RestaurantsAttire",
    "hours.Sunday", "attributes.BestNights.tuesday", "attributes.AcceptsInsurance",
    "attributes.BestNights.wednesday", "hours.Wednesday", "attributes.HairSpecializesIn.kids",
    "attributes.Open24Hours", "attributes.Ambience.trendy", "attributes.CoatCheck",
    "hours.Monday", "attributes.HairSpecializesIn.straightperms", "city",
    "attributes.HairSpecializesIn.curly", "attributes.Music.no_music", "hours.Tuesday",
    "attributes.HairSpecializesIn.africanamerican", "stars", "attributes.RestaurantsPriceRange2",
    "attributes.Ambience.intimate", "attributes.GoodForMeal.latenight",
    "attributes.GoodForMeal.dessert", "attributes.BusinessParking.validated",
    "attributes.GoodForMeal.lunch", "attributes.GoodForKids",
    "attributes.DietaryRestrictions.soy-free", "attributes.GoodForMeal.brunch",
    "attributes.BusinessParking.valet", "longitude", "attributes.DietaryRestrictions.gluten-free",
    "attributes.BYOBCorkage", "attributes.BusinessParking.garage", "attributes.BestNights.friday",
    "hours.Saturday", "attributes.Music.dj", "attributes.HairSpecializesIn.extensions",
    "attributes.BestNights.saturday", "attributes.Ambience.casual",
    "attributes.BestNights.thursday", "attributes.BestNights.monday",
    "attributes.HairSpecializesIn.asian", "attributes.DietaryRestrictions.kosher",
    "attributes.WiFi", "attributes.Smoking", "attributes.DietaryRestrictions.halal",
    "attributes.GoodForDancing", "attributes.ByAppointmentOnly", "attributes.Caters",
    "attributes.RestaurantsReservations", "attributes.DietaryRestrictions.dairy-free",
    "attributes.DietaryRestrictions.vegan", "attributes.Ambience.romantic",
    "attributes.Music.jukebox", "attributes.Ambience.upscale", "attributes.RestaurantsTakeOut",
    "attributes.BikeParking", "attributes.OutdoorSeating",
]

# Copy every business whose category string mentions Food or Restaurants.
with open('business.csv', 'r') as csvfile, open('restaurant_business.csv', 'w') as outputfile:
    reader = csv.DictReader(csvfile, fieldnames=fieldnames)
    writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
    for row in reader:
        # DictReader fills missing trailing fields with None; treat as empty.
        categories = row['categories'] or ''
        if 'Food' in categories or 'Restaurants' in categories:
            writer.writerow(row)

In [None]:
import csv

# Column order of restaurant_business.csv (same layout as business.csv).
fieldnames_bis = [
    "attributes.Ambience.divey", "attributes.RestaurantsDelivery", "attributes.DogsAllowed",
    "postal_code", "hours.Thursday", "attributes.HairSpecializesIn.coloring",
    "attributes.BestNights.sunday", "attributes.BYOB", "attributes.AgesAllowed",
    "attributes.Music.video", "hours.Friday", "latitude", "attributes.Alcohol",
    "attributes.Ambience.classy", "attributes.RestaurantsTableService", "business_id",
    "attributes.Ambience.touristy", "attributes.RestaurantsCounterService", "attributes.Corkage",
    "attributes.RestaurantsGoodForGroups", "categories", "name",
    "attributes.BusinessAcceptsBitcoin", "attributes.HappyHour", "attributes.WheelchairAccessible",
    "attributes.Ambience.hipster", "attributes.BusinessAcceptsCreditCards", "is_open",
    "attributes.DietaryRestrictions.vegetarian", "attributes.Music.live",
    "attributes.Music.background_music", "neighborhood", "attributes.BusinessParking.lot",
    "attributes.Music.karaoke", "review_count", "attributes.GoodForMeal.breakfast",
    "attributes.NoiseLevel", "attributes.HairSpecializesIn.perms", "state",
    "attributes.DriveThru", "attributes.HasTV", "attributes.GoodForMeal.dinner",
    "attributes.BusinessParking.street", "address", "attributes.RestaurantsAttire",
    "hours.Sunday", "attributes.BestNights.tuesday", "attributes.AcceptsInsurance",
    "attributes.BestNights.wednesday", "hours.Wednesday", "attributes.HairSpecializesIn.kids",
    "attributes.Open24Hours", "attributes.Ambience.trendy", "attributes.CoatCheck",
    "hours.Monday", "attributes.HairSpecializesIn.straightperms", "city",
    "attributes.HairSpecializesIn.curly", "attributes.Music.no_music", "hours.Tuesday",
    "attributes.HairSpecializesIn.africanamerican", "stars", "attributes.RestaurantsPriceRange2",
    "attributes.Ambience.intimate", "attributes.GoodForMeal.latenight",
    "attributes.GoodForMeal.dessert", "attributes.BusinessParking.validated",
    "attributes.GoodForMeal.lunch", "attributes.GoodForKids",
    "attributes.DietaryRestrictions.soy-free", "attributes.GoodForMeal.brunch",
    "attributes.BusinessParking.valet", "longitude", "attributes.DietaryRestrictions.gluten-free",
    "attributes.BYOBCorkage", "attributes.BusinessParking.garage", "attributes.BestNights.friday",
    "hours.Saturday", "attributes.Music.dj", "attributes.HairSpecializesIn.extensions",
    "attributes.BestNights.saturday", "attributes.Ambience.casual",
    "attributes.BestNights.thursday", "attributes.BestNights.monday",
    "attributes.HairSpecializesIn.asian", "attributes.DietaryRestrictions.kosher",
    "attributes.WiFi", "attributes.Smoking", "attributes.DietaryRestrictions.halal",
    "attributes.GoodForDancing", "attributes.ByAppointmentOnly", "attributes.Caters",
    "attributes.RestaurantsReservations", "attributes.DietaryRestrictions.dairy-free",
    "attributes.DietaryRestrictions.vegan", "attributes.Ambience.romantic",
    "attributes.Music.jukebox", "attributes.Ambience.upscale", "attributes.RestaurantsTakeOut",
    "attributes.BikeParking", "attributes.OutdoorSeating",
]
fieldnames_reviews = ["funny","user_id","review_id","text","business_id","stars","date","useful","cool"]

# Pass 1: collect the ids of all restaurant businesses.
with open('restaurant_business.csv', 'r') as csvfile:
    reader_bis = csv.DictReader(csvfile, fieldnames=fieldnames_bis)
    restaurant_ids = {row['business_id'] for row in reader_bis}

# Pass 2: stream the reviews once and keep those of a restaurant. Nesting the
# review reader inside a loop over businesses exhausted it after the first
# business, so only that business's reviews were ever written.
with open('review.csv','r') as reviewcsv, open('restaurant_reviews.csv', 'w') as output_rev:
    reader_rev = csv.DictReader(reviewcsv, fieldnames=fieldnames_reviews)
    writer = csv.DictWriter(output_rev, fieldnames=fieldnames_reviews)
    for row_reviews in reader_rev:
        if row_reviews['business_id'] in restaurant_ids:
            writer.writerow(row_reviews)

### Read the file restaurant_business.csv

In [None]:
import pandas as pd

# restaurant_business.csv is written without a header row, so the column
# names are supplied explicitly in file order.
headers = [
    "attributes.Ambience.divey", "attributes.RestaurantsDelivery", "attributes.DogsAllowed",
    "postal_code", "hours.Thursday", "attributes.HairSpecializesIn.coloring",
    "attributes.BestNights.sunday", "attributes.BYOB", "attributes.AgesAllowed",
    "attributes.Music.video", "hours.Friday", "latitude", "attributes.Alcohol",
    "attributes.Ambience.classy", "attributes.RestaurantsTableService", "business_id",
    "attributes.Ambience.touristy", "attributes.RestaurantsCounterService", "attributes.Corkage",
    "attributes.RestaurantsGoodForGroups", "categories", "name",
    "attributes.BusinessAcceptsBitcoin", "attributes.HappyHour", "attributes.WheelchairAccessible",
    "attributes.Ambience.hipster", "attributes.BusinessAcceptsCreditCards", "is_open",
    "attributes.DietaryRestrictions.vegetarian", "attributes.Music.live",
    "attributes.Music.background_music", "neighborhood", "attributes.BusinessParking.lot",
    "attributes.Music.karaoke", "review_count", "attributes.GoodForMeal.breakfast",
    "attributes.NoiseLevel", "attributes.HairSpecializesIn.perms", "state",
    "attributes.DriveThru", "attributes.HasTV", "attributes.GoodForMeal.dinner",
    "attributes.BusinessParking.street", "address", "attributes.RestaurantsAttire",
    "hours.Sunday", "attributes.BestNights.tuesday", "attributes.AcceptsInsurance",
    "attributes.BestNights.wednesday", "hours.Wednesday", "attributes.HairSpecializesIn.kids",
    "attributes.Open24Hours", "attributes.Ambience.trendy", "attributes.CoatCheck",
    "hours.Monday", "attributes.HairSpecializesIn.straightperms", "city",
    "attributes.HairSpecializesIn.curly", "attributes.Music.no_music", "hours.Tuesday",
    "attributes.HairSpecializesIn.africanamerican", "stars", "attributes.RestaurantsPriceRange2",
    "attributes.Ambience.intimate", "attributes.GoodForMeal.latenight",
    "attributes.GoodForMeal.dessert", "attributes.BusinessParking.validated",
    "attributes.GoodForMeal.lunch", "attributes.GoodForKids",
    "attributes.DietaryRestrictions.soy-free", "attributes.GoodForMeal.brunch",
    "attributes.BusinessParking.valet", "longitude", "attributes.DietaryRestrictions.gluten-free",
    "attributes.BYOBCorkage", "attributes.BusinessParking.garage", "attributes.BestNights.friday",
    "hours.Saturday", "attributes.Music.dj", "attributes.HairSpecializesIn.extensions",
    "attributes.BestNights.saturday", "attributes.Ambience.casual",
    "attributes.BestNights.thursday", "attributes.BestNights.monday",
    "attributes.HairSpecializesIn.asian", "attributes.DietaryRestrictions.kosher",
    "attributes.WiFi", "attributes.Smoking", "attributes.DietaryRestrictions.halal",
    "attributes.GoodForDancing", "attributes.ByAppointmentOnly", "attributes.Caters",
    "attributes.RestaurantsReservations", "attributes.DietaryRestrictions.dairy-free",
    "attributes.DietaryRestrictions.vegan", "attributes.Ambience.romantic",
    "attributes.Music.jukebox", "attributes.Ambience.upscale", "attributes.RestaurantsTakeOut",
    "attributes.BikeParking", "attributes.OutdoorSeating",
]
data_business = pd.read_csv("restaurant_business.csv", names = headers)
data_business.head()

### Read the file restaurant_reviews.csv

In [None]:
# Load the reviews into a DataFrame; the first row of the file is used as the
# header (pandas default).
# NOTE(review): the section heading mentions restaurant_reviews.csv but this
# reads reviews_6000.csv - presumably a sampled subset; confirm which file
# is intended.
data_review = pd.read_csv("reviews_6000.csv")
# In a notebook only the last expression of a cell is displayed, so the
# head() preview is discarded and only the shape is shown.
data_review.head()
data_review.shape

## Data Integration:
##### Merge (inner join) the data on the common attribute "business_id" to get the reviews of the restaurants

In [None]:
# Inner join on business_id: keeps only the reviews whose business appears in
# the restaurant table, with the business columns attached to each review.
data_merge = pd.merge(data_review, data_business, on='business_id', how='inner')
data_merge.head()

In [None]:
# Keep only the review id and the review text for the text-mining steps.
data = data_merge[["review_id", "text"]]
data.head()
# `shape` is a tuple attribute, not a method; calling it raised TypeError.
data.shape

## Task 1
In this task, we extract the sentences from the review text and pre-process them. The preprocessed reviews are represented using two techniques: Bag of Words and TF-IDF. A classifier is built for each of these representations.

### Preprocessing of text reviews:

In [None]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from iteration_utilities import deepflatten
from nltk.stem import RegexpStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re


def review_to_words( i, raw_review, dict_ids, j ):
    """Clean one raw review and split it into sentences.

    Steps: strip markup, drop every character that is not a letter or a
    period, lower-case, remove English stop words (keeping the negations
    "no", "not", "nor"), strip a trailing "s" from each word, then split the
    cleaned text on periods.

    Args:
        i: Index label of the review in the global `data` frame; used to look
            up its review_id.
        raw_review: Raw review text (may contain markup).
        dict_ids: Dict updated in place; each sentence is stored as
            dict_ids[running_index] = (review_id, sentence).
        j: Next free running sentence index in `dict_ids`.

    Returns:
        Tuple (next_j, sentences): the running index after this review and
        the list of cleaned sentences.
    """
    # 1. Remove tags
    review_text = BeautifulSoup(raw_review).get_text()

    # 2. Remove non-letters (periods are kept as sentence delimiters)
    letters_only = re.sub("[^a-zA-Z.]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Remove the stop words, but keep negations because they carry
    #    sentiment. No loop variable named `i` is used here: the old loop
    #    overwrote parameter `i`, so every sentence got the wrong review id.
    stop_words = set(stopwords.words("english")) - {"no", "not", "nor"}
    meaningful_words = [w for w in words if w not in stop_words]

    # 5. Stemming: strip a trailing "s"; words that become empty are dropped
    st = RegexpStemmer('s$')
    stemmed_words = [stemmed for stemmed in map(st.stem, meaningful_words) if stemmed]

    # 6. Join the words back into one string separated by space
    review = " ".join( stemmed_words )

    # 7. Split based on period and record which review each sentence is from
    sentences = review.split(".")
    review_id = data["review_id"][i]
    for sentence in sentences:
        dict_ids[j] = review_id, sentence
        j = j + 1

    return j, sentences


In [None]:
# running sentence index -> (review_id, sentence); filled by review_to_words
dict_ids = dict()
# next free key in dict_ids, threaded through every call
j = 0

# One entry per row of the review text column
num_reviews = data["text"].size

# Cleaned reviews, each stored as the list of its sentences
clean_reviews = list()

for i in range(num_reviews):
    raw_text = data["text"][i]
    j, clean_review = review_to_words(i, raw_text, dict_ids, j)
    clean_reviews.append(clean_review)
    
 


In [None]:
from iteration_utilities import deepflatten

# Flatten the per-review sentence lists by one level into a single list of
# sentences, keeping their order.
clean = list(deepflatten(clean_reviews, depth=1))


### Take only the useful reviews from the cleaned reviews

In [None]:
#Food words are taken from the reviews. The sentences which contain these words are considered from the cleaned reviews.
# (Entries are stored in their stemmed form, e.g. 'frie', 'sandwiche'.)

words =  ['meal', 'tea', 'coffee', 'salad', 'veggie', 'rice', 'teriyaki', 'chicken', 'carrot', 'tomato', 'beef', 
          'mushroom', 'chilli', 'shrimp', 'egg', 'broccoli', 'curry', 'tofu', 'lettuce', 'puff', 'noodle', 'meat', 
          'roll', 'wonton', 'bread', 'burger', 'bacon','bun', 'sprout','boba', 'pattie', 'bamboo', 'berry', 'bruschetta',
          'cheeto', 'chocolate', 'shake', 'cereal', 'nugget', 'corn', 'crevette','cocktail','daikon', 'donburi','dumpling',
          'doughnut', 'frie','fish', 'sandwich', 'cake','taco', 'goyuza', 'hamburger', 'haggi', 'kimmari','kimari', 
          'kimtchi', 'kimchi', 'lobster','lemonade', 'margarita','milkshake','mochi','milk', 'nacho', 'onigiri','pattie',
          'pasta','popcorn', 'pudding','pizza', 'rangoon', 'slider','mustard', 'sausage','spaghetti', 'sappora', 
          'salmon', 'seaweed','scallop','sansotei','shellfish', 'twinkie', 'zucchini', 'schnitzel', 'macaron', 'brulee', 
          'steak', 'baco', 'mimosa', 'salmon', 'calamari', 'gnocchi', 'crab', 'chilaquile', 'pancake', 'cocktail', 
          'buttermilk', 'caesar', 'buffalo', 'broccolini', 'pierogi', 'marshmallow', 'caramel', 'milkshake', 'carolina', 
          'coleslaw', 'coke', 'cheeseburger', 'banzai', 'swenson', 'bacon', 'hankerin', 'cookie', 'bun', 'ketchup', 
          'cheesecake', 'brownie',  'onion','anglaise','apricot','avocado','arroncini','bolognese','burrito','beet',
          'buffalo','bread','burrata','bacon','blueberry','boar','bun','biscotti','caramel','chimichurri','chestnut',
          'chorizo','chipotle','corn','chicken','croquette','cheese','caper','crab','chimi','cotta','clam','cream',
          'creme','cheesecake','cod','charcuterie','cider','egg','enchilada','fennel','fajita','fontina','fish','fajita',
          'flan','fig','grapefruit','guacamole','gremolata','goat','honey','halibut','haddock','jam','kale','lamb',
          'lobster','linguine','latte','mojito','mascarpone','mignonette','marinara','mushroom','mimosa','molacajete',
          'musse','nutella','oyster','oreo','orange','oolong','parmesan','pecorino','pasta','pumpkin','polenta','pork',
          'peanut','panna','pudding','pistachio','potato','pastry','quesadilla', 'ribollita','radish','risotto',
          'ratatouille','rosti','ramekin','ragu','rhubarb','ramekin','salsa','seafood','scotch','steak','sprout','salumi',
          'striploin','strawberry','shortrib','sandwiche','smoothie','tequila','tortilla','toritilla','tartare','toast',
          'tamale','tenderloin','tagliatelle','toffee','tart', 'vanilla','wonton'
         ]

# Set for O(1) membership tests (the list contains duplicates).
food_words = set(words)

useful_reviews = []
useful_count = 0
# position in useful_reviews -> (review_id, sentence) of that sentence
dict_review = {}

# `clean` is the flattened list of sentences and `dict_ids` was filled in the
# same order with the same running index, so clean[k] is the sentence stored
# in dict_ids[k]. Using that index records the exact source review of each
# kept sentence; the previous quadratic substring search attributed a
# sentence to the last review containing similar text.
for k, sentence in enumerate(clean):
    # Tokenise once per sentence and keep it if any token is a food word.
    if food_words.isdisjoint(sentence.split(" ")):
        continue
    dict_review[len(useful_reviews)] = dict_ids[k]
    useful_reviews.append(sentence)
            

### Generate the training labels

The AFINN lexicon assigns every word a sentiment score in the range -5 to 5. The total score of a sentence is the sum of the AFINN scores of its words. A positive total marks the sentence as positive; otherwise (zero or negative) it is labelled negative.

In [None]:
from afinn import Afinn
# AFINN lexicon scorer (the instance rebinds the name `afinn`)
afinn = Afinn()

train_data = useful_reviews

# 'pos' when the summed AFINN score of the sentence is strictly positive,
# 'neg' otherwise (a score of zero counts as negative).
train_labels = ['pos' if afinn.score(sentence) > 0 else 'neg'
                for sentence in train_data]

In [None]:
# Save the training reviews sentiments in a dictionary 'dict_sentiments':
# sentence position -> ((review_id, sentence), label)
dict_sentiments = {
    idx: (dict_review[idx], train_labels[idx])
    for idx in range(len(train_data))
}
   

#### Giving Numerical Labels

In [None]:
from afinn import Afinn
# AFINN lexicon scorer (the instance rebinds the name `afinn`)
afinn = Afinn()

train_data = useful_reviews

# Numeric twin of the string labels: 1 for a strictly positive AFINN score,
# 0 for zero or negative.
train_labels_num = [int(afinn.score(sentence) > 0) for sentence in train_data]


## Data Transformation

###  Data Representation 1: Bag Of Words 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn the vocabulary of the training sentences and turn each
# sentence into a vector of token counts (default CountVectorizer settings).
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data)

### Learning Models for bag of words representation
### 1] SVM
### Parameter Optimisation

In [None]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


#Grid Search - Used to find the best combination of parameters
svm = SVC()

# NOTE(review): per the scikit-learn docs `cache_size` only sets the kernel
# cache size and `degree` is only used by the 'poly' kernel, so much of this
# grid repeats identical models - consider trimming it.
param_grid = { "C"      : [1, 6, 10],
               "kernel" : ["rbf", "linear", "poly", "sigmoid"],
               "degree" : [2, 3, 5, 10],
               "cache_size" : [100, 200, 300]
             }

# 10-fold cross-validated search scored on accuracy; refit=True retrains the
# best configuration on the whole data set.
model = grid_search.GridSearchCV(estimator = svm, param_grid = param_grid, scoring = 'accuracy', verbose = 2, n_jobs = 4, iid = True, refit = True, cv=10)

# Fit on the bag-of-words features. The previous call used `train2_tfidf` /
# `train2_labels`, which are not defined at this point of the notebook; the
# other bag-of-words searches use train_counts / train_labels_num.
model.fit(train_counts, train_labels_num)
print ("Best score: %0.3f" % model.best_score_)
print ("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### The best parameters given by Grid Search CV are  
"C=1, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=100, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None"
#####  These parameters are considered to build the model.

In [None]:
from sklearn.svm import SVC
# Train the final SVM on the bag-of-words counts with the string labels
# ('pos' / 'neg'), using the parameters reported by the grid search.
# NOTE(review): with kernel='linear', `degree`, `gamma` and `coef0` should
# have no effect - confirm against the scikit-learn SVC documentation.
svm = SVC(C=1, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False,
          tol=0.001, cache_size=100, class_weight=None,
          verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None).fit(train_counts, train_labels)

####  Learning Curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        svm, train_counts, train_labels, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Mean and spread over the folds for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("SVMClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.0, 1.01))
# NOTE(review): the y-axis is inverted, so higher scores are drawn lower.
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# The legend is created only after the labelled lines exist; calling it
# before plotting found no handles and only produced a warning.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference
From the graph of learning curve, it can be inferred that it has Low Variance and Low Bias

#### ROC Curve for SVM

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
 
# Same SVM configuration as before, but with probability=True so that
# predict_proba is available for the ROC curve. It is not fitted here: the
# earlier full-data fit was discarded by the fit on the training split below
# and only cost an extra (expensive) training run.
svmc = SVC(C=1, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=True,
          tol=0.001, cache_size=100, class_weight=None,
          verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_counts, train_labels_num, test_size=.25)
svm = svmc.fit(X_train, y_train)

# Determine the false positive and true positive rates from the predicted
# probability of the positive class (column 1)
fpr, tpr, _ = roc_curve(y_test, svm.predict_proba(X_test)[:,1])

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print('ROC AUC: %0.2f' % roc_auc)

# Plot of a ROC curve for a specific class; the dashed diagonal is the
# chance level
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### 2] Random Forest:
#### Parameter Optimisation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


# Grid search: exhaustive 10-fold cross-validated search for the best
# random-forest parameters, scored on accuracy.
ran_forest = RandomForestClassifier()
ran_forest.fit(train_counts, train_labels)

param_grid = dict(
    n_estimators=[100, 250, 200],
    criterion=["gini", "entropy"],
    min_samples_split=[2, 5, 10],
)

model = grid_search.GridSearchCV(
    estimator=ran_forest,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=2,
    n_jobs=1,
    iid=True,
    refit=True,
    cv=10,
)
model.fit(train_counts, train_labels_num)

# Report the winning score and the searched parameters in alphabetical order
print ("Best score: %0.3f" % model.best_score_)
print ("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train the final random forest on the bag-of-words counts with the string
# labels, using 250 trees, the Gini criterion and min_samples_split=2.
forest = RandomForestClassifier(n_estimators=250, criterion='gini', 
                                min_samples_split = 2).fit(train_counts, train_labels)

#### Learning Curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        forest, train_counts, train_labels_num, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Mean and spread over the folds for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Start a fresh figure, as the other learning-curve cells do
plt.figure()
plt.title("RandomForestClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
# NOTE(review): the y-axis is inverted, so higher scores are drawn lower.
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# The legend must be requested after the labelled lines exist; it was only
# called before plotting, so no legend was ever shown.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

### Inference
From the learning curve, it can be inferred that it has High Variance and Low Bias

#### ROC Curve Random Forest

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
 
# Unfitted forest with the tuned parameters; trained on the split below
forest = RandomForestClassifier(min_samples_split=2,
                                criterion='gini',
                                n_estimators=250)

# Hold out a shuffled 25% of the data for the ROC evaluation
X_train, X_test, y_train, y_test = train_test_split(
    train_counts, train_labels_num, test_size=.25)
forest.fit(X_train, y_train)

# Predicted probability of the positive class (column 1) drives the
# false/true positive rates
positive_proba = forest.predict_proba(X_test)[:, 1]
fpr1, tpr1, _ = roc_curve(y_test, positive_proba)

# Area under the ROC curve
roc_auc = auc(fpr1, tpr1)
print('ROC AUC: %0.2f' % roc_auc)

# ROC curve plus the dashed chance-level diagonal
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.plot(fpr1, tpr1, label=roc_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### 3] Logistic Regression
#### Parameter Optimization

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


# Grid search: exhaustive 10-fold cross-validated search for the best
# logistic-regression parameters, scored on accuracy.
lr = LogisticRegression()
lr.fit(train_counts, train_labels)

param_grid = dict(
    C=[0.5, 0.8, 1, 1.5],
    intercept_scaling=[0.5, 1, 2],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

model = grid_search.GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=2,
    n_jobs=4,
    iid=True,
    refit=True,
    cv=10,
)
model.fit(train_counts, train_labels_num)

# Report the winning score and the searched parameters in alphabetical order
print ("Best score: %0.3f" % model.best_score_)
print ("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
 

In [None]:
from sklearn.linear_model import LogisticRegression
# Train the final logistic regression on the bag-of-words counts with the
# string labels, using C=1.5, intercept_scaling=0.5 and the liblinear solver.
logistic = LogisticRegression(C=1.5, intercept_scaling=0.5, solver='liblinear'
                             ).fit(train_counts, train_labels)

#### Learning Curve:

In [None]:
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        logistic, train_counts, train_labels_num, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Mean and spread over the folds for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("LogisticRegressionClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
# NOTE(review): the y-axis is inverted, so higher scores are drawn lower.
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# The legend must be requested after the labelled lines exist; it was only
# called before plotting, so no legend was ever shown.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference :
From the learning curve, we can infer that this model gives Low Bias and High Variance

#### ROC Curve Logistic Regression

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
 
# Unfitted logistic regression with the tuned parameters
logistic = LogisticRegression(solver='liblinear', intercept_scaling=0.5, C=1.5)

# Hold out a shuffled 25% of the data for the ROC evaluation
X_train, X_test, y_train, y_test = train_test_split(
    train_counts, train_labels_num, test_size=.25)
log = logistic.fit(X_train, y_train)

# Predicted probability of the positive class (column 1) drives the
# false/true positive rates
positive_proba = log.predict_proba(X_test)[:, 1]
fpr2, tpr2, _ = roc_curve(y_test, positive_proba)

# Area under the ROC curve
roc_auc = auc(fpr2, tpr2)
print('ROC AUC: %0.2f' % roc_auc)

# ROC curve only; unlike the other ROC cells the chance-level diagonal is
# not drawn here
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure()
plt.plot(fpr2, tpr2, label=roc_label)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### 4] AdaBoost
#### Parameter Optimisation

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import decomposition, grid_search

#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over AdaBoost's ensemble size and learning rate on the
# bag-of-words features, scored by accuracy.
ada_boost = AdaBoostClassifier(base_estimator=None, algorithm='SAMME.R', random_state=None)

param_grid = dict(
    n_estimators=[50, 80, 100, 150],
    learning_rate=[0.1, 0.5, 0.8, 0.9, 1],
)

model = grid_search.GridSearchCV(
    estimator=ada_boost,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train_counts, train_labels_num)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Fit AdaBoost on the bag-of-words features.
# The numeric labels are used so this model is trained on the same target as
# every other cell that fits or scores `adaboost` (train_labels_num).
adaboost = AdaBoostClassifier(base_estimator=None, n_estimators=150, learning_rate=1, algorithm='SAMME.R',
                              random_state=None).fit(train_counts, train_labels_num)

#### Learning Curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...
 
# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        adaboost, train_counts, train_labels_num, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("AdaBoostClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# The legend has to be built after the labelled lines exist; calling it
# before plotting finds no handles and nothing is drawn.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference
From the Learning Curve, it can be inferred that adaboost classifier has Low variance and Low bias.

#### ROC Curve for AdaBoost

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
 
# ROC curve for AdaBoost on the bag-of-words features,
# scored on a random 25% hold-out split.
adaboost = AdaBoostClassifier(
    base_estimator=None,
    n_estimators=150,
    learning_rate=1,
    algorithm='SAMME.R',
    random_state=None,
)

X_train, X_test, y_train, y_test = train_test_split(
    train_counts, train_labels_num, test_size=.25)
ada = adaboost.fit(X_train, y_train)

# Column 1 of predict_proba is used as the ranking score for the ROC curve
ada_holdout_scores = ada.predict_proba(X_test)[:, 1]
fpr3, tpr3, _ = roc_curve(y_test, ada_holdout_scores)

roc_auc = auc(fpr3, tpr3)
print('ROC AUC: %0.2f' % roc_auc)

plt.plot(fpr3, tpr3, label='ROC curve (area = %0.2f)' % roc_auc)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

###  Data Representation 2: Tfidf [Term Frequency Inverse Document Frequency]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts into TF-IDF features
# (l2-normalised rows, smoothed idf, sublinear tf).
tfidf_transformer = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
).fit(train_counts)
train_tfidf = tfidf_transformer.transform(train_counts)

### Learning Models
### 1] SVM
#### Parameter Optimisation

In [None]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over SVC settings on the TF-IDF features, scored by accuracy.
svm = SVC()

# NOTE(review): cache_size looks like a memory knob rather than a model
# hyper-parameter, and degree should only matter for the poly kernel --
# confirm both belong in the search space.
param_grid = dict(
    C=[1, 6, 10],
    kernel=["rbf", "linear", "poly", "sigmoid"],
    degree=[2, 3, 5, 10],
    cache_size=[100, 200, 300],
)

model = grid_search.GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train_tfidf, train_labels_num)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### The best parameters given by Grid Search CV are  
"C = 6, cache_size = 100, degree = 2, kernel = 'linear'"
#####  These parameters are considered to build the model.

In [None]:
from sklearn.svm import SVC
# Linear SVC with the settings picked by the grid search, fitted on all TF-IDF rows.
svm = SVC(kernel='linear', C=6, degree=2, cache_size=100)
svm = svm.fit(train_tfidf, train_labels_num)

#### Learning curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
 
svm = SVC(C = 6, cache_size = 100, degree = 2, kernel = 'linear').fit(train_tfidf, train_labels_num)

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        svm, train_tfidf, train_labels_num, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Start a fresh figure, as the other learning-curve cells do, so the curve
# is never drawn on top of a figure left open by an earlier cell.
plt.figure()
plt.title("SVMClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.0, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

### Inference :
From the Learning Curve, it can be inferred that SVM classifier has Low Variance and Low Bias

#### ROC Curve for SVM

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
 
# ROC curve for the linear SVC on the TF-IDF features.
# The estimator is left unfitted here: it is trained on the training split
# only, so held-out rows never influence the model that is being scored.
svmc = SVC(C = 6, cache_size = 100, degree = 2, kernel = 'linear')

# shuffle and split training and test sets (TF-IDF, matching this section)
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_labels_num, test_size=.25)
svm = svmc.fit(X_train, y_train)

# Determine the false positive and true positive rates.
# SVC only offers predict_proba when constructed with probability=True; the
# signed distance to the separating hyperplane is a valid ranking score for
# roc_curve and avoids the extra internal cross-validation that option costs.
fpr, tpr, _ = roc_curve(y_test, svm.decision_function(X_test))

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print('ROC AUC: %0.2f' % roc_auc)

# Plot of a ROC curve for a specific class, with the chance diagonal
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### 2] Random Forest
#### Parameter Optimisation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over random-forest size, split criterion and minimum
# split size on the TF-IDF features, scored by accuracy.
ran_forest = RandomForestClassifier()

param_grid = dict(
    n_estimators=[100, 250, 200],
    criterion=["gini", "entropy"],
    min_samples_split=[2, 5, 10],
)

model = grid_search.GridSearchCV(
    estimator=ran_forest,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train_tfidf, train_labels_num)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Random forest (200 gini trees, min_samples_split=5) fitted on all TF-IDF rows.
forest = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    min_samples_split=5,
)
forest = forest.fit(train_tfidf, train_labels_num)

#### Learning curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
# Fit on the numeric labels, the same target the learning curve below and
# the preceding cell use, so `forest` predicts one label type everywhere.
forest = RandomForestClassifier(criterion = 'gini', min_samples_split = 5, n_estimators = 200).fit(train_tfidf, train_labels_num)

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        forest, train_tfidf, train_labels_num, cv=10, n_jobs=4, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("RandomForestClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference:
From the Learning Curve, it can be inferred that Random Forest classifier has Low Bias, High Variance

### 3] Logistic Regression
#### Parameter Optimisation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over logistic-regression regularisation strength,
# intercept scaling and solver on the TF-IDF features, scored by accuracy.
lr = LogisticRegression()

param_grid = dict(
    C=[0.5, 0.8, 1, 1.5],
    intercept_scaling=[0.5, 1, 2],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

model = grid_search.GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train_tfidf, train_labels_num)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Logistic regression (lbfgs solver) fitted on all TF-IDF rows.
logistic = LogisticRegression(solver='lbfgs', C=1.5, intercept_scaling=0.5)
logistic = logistic.fit(train_tfidf, train_labels_num)

#### Learning curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
 
# Bind the estimator to `logistic`, the name the learning curve below reads.
# The original bound it to the misspelt `logistc`, so the model configured
# in this cell was never the one being evaluated. Numeric labels are used to
# match the learning-curve target.
logistic = LogisticRegression(C = 1.5, intercept_scaling = 0.5, solver = 'liblinear').fit(train_tfidf, train_labels_num)

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        logistic, train_tfidf, train_labels_num, cv=10, n_jobs=4, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("LogisticRegressionClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

### Inference :
From the Learning Curve, it can be inferred that Logistic Regression classifier has Low Bias, High Variance

### 4] Ada Boost
#### Parameter Optimisation

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import decomposition, grid_search

#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over AdaBoost's ensemble size and learning rate on the
# TF-IDF features, scored by accuracy.
ada_boost = AdaBoostClassifier(base_estimator=None, algorithm='SAMME.R', random_state=None)

param_grid = dict(
    n_estimators=[50, 80, 100, 150],
    learning_rate=[0.1, 0.5, 0.8, 0.9, 1],
)

model = grid_search.GridSearchCV(
    estimator=ada_boost,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train_tfidf, train_labels_num)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# AdaBoost (150 estimators, learning rate 1) fitted on all TF-IDF rows.
adaboost = AdaBoostClassifier(n_estimators=150, learning_rate=1)
adaboost = adaboost.fit(train_tfidf, train_labels_num)

#### Learning curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
adaboost = AdaBoostClassifier(learning_rate = 1, n_estimators = 150).fit(train_tfidf, train_labels_num)

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        adaboost, train_tfidf, train_labels_num, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("AdaBoostClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.6, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference :
From the Learning Curve, it can be inferred that Ada Boost classifier has Low Variance, Low Bias

## Task 2
###### In this task we consider the preprocessed text reviews for classification of reviews into 4 courses. The 4 courses are:
      1]Soups and Salads 
      2]Appetizers
      3]Main Course
      4]Desserts
###### The data is represented in two different ways: one hot encoding and TFIDF representation. After the 4 course meal classification,
###### we predict the most preferred food items in each of the categories.

### Generate the Training Data
The food words are classified into 4 courses. Each review is labelled with the name of the course whose food words it contains.

In [None]:
# Candidate food vocabulary as originally written; it contains repeats.
_food_word_candidates = ['meal', 'tea', 'coffee', 'salad', 'veggie', 'rice', 'teriyaki', 'chicken', 'carrot', 'tomato', 'beef', 
          'mushroom', 'chilli', 'shrimp', 'egg', 'broccoli', 'curry', 'tofu', 'lettuce', 'puff', 'noodle', 'meat', 
          'roll', 'wonton', 'bread', 'burger', 'bacon','bun', 'sprout','boba', 'pattie', 'bamboo', 'berry', 'bruschetta',
          'cheeto', 'chocolate', 'shake', 'cereal', 'nugget', 'corn', 'crevette','cocktail','daikon', 'donburi','dumpling',
          'doughnut', 'frie','fish', 'sandwich', 'cake','taco', 'goyuza', 'hamburger', 'haggi', 'kimmari','kimari', 
          'kimtchi', 'kimchi', 'lobster','lemonade', 'margarita','milkshake','mochi','milk', 'nacho', 'onigiri','pattie',
          'pasta','popcorn', 'pudding','pizza', 'rangoon', 'slider','mustard', 'sausage','spaghetti', 'sappora', 
          'salmon', 'seaweed','scallop','sansotei','shellfish', 'twinkie', 'zucchini', 'schnitzel', 'macaron', 'brulee', 
          'steak', 'baco', 'mimosa', 'salmon', 'calamari', 'gnocchi', 'crab', 'chilaquile', 'pancake', 'cocktail', 
          'buttermilk', 'caesar', 'buffalo', 'broccolini', 'pierogi', 'marshmallow', 'caramel', 'milkshake', 'carolina', 
          'coleslaw', 'coke', 'cheeseburger', 'banzai', 'swenson', 'bacon', 'hankerin', 'cookie', 'bun', 'ketchup', 
          'cheesecake', 'brownie',  'onion','anglaise','apricot','avocado','arroncini','bolognese','burrito','beet',
          'buffalo','bread','burrata','bacon','blueberry','boar','bun','biscotti','caramel','chimichurri','chestnut',
          'chorizo','chipotle','corn','chicken','croquette','cheese','caper','crab','chimi','cotta','clam','cream',
          'creme','cheesecake','cod','charcuterie','cider','egg','enchilada','fennel','fajita','fontina','fish','fajita',
          'flan','fig','grapefruit','guacamole','gremolata','goat','honey','halibut','haddock','jam','kale','lamb',
          'lobster','linguine','latte','mojito','mascarpone','mignonette','marinara','mushroom','mimosa','molacajete',
          'musse','nutella','oyster','oreo','orange','oolong','parmesan','pecorino','pasta','pumpkin','polenta','pork',
          'peanut','panna','pudding','pistachio','potato','pastry','quesadilla', 'ribollita','radish','risotto',
          'ratatouille','rosti','ramekin','ragu','rhubarb','ramekin','salsa','seafood','scotch','steak','sprout','salumi',
          'striploin','strawberry','shortrib','sandwiche','smoothie','tequila','tortilla','toritilla','tartare','toast',
          'tamale','tenderloin','tagliatelle','toffee','tart', 'vanilla','wonton'
         ]

# `words` is the de-duplicated vocabulary as an ORDERED list. A set removes
# the repeats too, but it cannot be indexed (words[j]) and has no stable
# column order, both of which the feature-building cells depend on.
words = []
for _candidate in _food_word_candidates:
    if _candidate not in words:
        words.append(_candidate)

main_course = ['meal', 'fish', 'rice', 'teriyaki', 'chicken', 'beef', 'curry', 'tofu', 'meat', 'bacon','crevette', 'donburi', 'lobster', 'mochi', 'onigiri', 'anglaise','bolognese','buffalo','boar','beef','chipotle','chicken','croquette','crab','chimi','enchilada','fajita','fontina','fish','fajita','gremolata','goat','lamb','lobster','linguine','mimosa','polenta','pork','quesadilla','ramekin','ragu','steak','salumi','striploin','shortrib','tamale','tenderloin','tagliatelle','wonton']
soups = ['tea', 'coffee', 'salad', 'veggie', 'carrot', 'tomato', 'chilli',  'broccoli', 'lettuce', 'boba', 'berry', 'corn', 'cocktail', 'lemonade', 'margarita', 'mustard', 'beet','chestnut','corn','cider','coffee','fennel','kale','lemon','latte','mojito','mascarpone','oolong','pomegranate','parmesan','pumpkin','ribollita','radish','sprout','tequila','tea','tomato']
desserts = ['chocolate', 'shake', 'doughnut', 'cake', 'haggi', 'milkshake', 'milk', 'pudding', 'apple','apricot','avocado','banana','blueberry','caramel','cream','creme','cheesecake','flan','fig','grapefruit','guacamole','honey','jam','molacajete','mousse','milkshake','nutella','oreo','orange','panna','pudding','pistachio','pastry','rhubarb','ramekin','strawberry','smoothie','toffee','tart','vanilla']
appetizers = ['egg', 'shrimp', 'puff', 'roll', 'wonton', 'bread', 'burger', 'mushroom', 'bun', 'sprout', 'pattie', 'bamboo', 'bruschetta', 'cheeto', 'cereal', 'nugget', 'daikon', 'dumpling', 'noodle', 'frie', 'sandwich', 'taco', 'goyuza', 'hamburger', 'kimmari','kimari', 'kimtchi', 'kimchi', 'nacho', 'pattie','pasta','popcorn','pizza', 'rangoon', 'slider', 'sausage', 'spaghetti', 'arroncini','butter','burrito','burrito','bread','bun','burrata','bacon','biscotti','charcuterie','chimichurri','chorizo','cheese','caper','cotta','clam','cod','egg','halibut','haddock','mignonette','marinara','mushroom','oyster','pecorino','pasta','peanut','potatoe','risotto','ratatouille','rosti','salsa','sandwiche','seafood','scotch','tortilla','toritilla','tartare','toast']

train2_labels = []
train2_data = dict_sentiments

# Set copies for O(1) token lookups; the list globals above are left intact.
_main_course_set = set(main_course)
_soups_set = set(soups)
_desserts_set = set(desserts)

# One label per review. Precedence when several courses match is
# desserts > soups_and_salads > main_course, and "appetizers" is the fallback
# for everything else (the `appetizers` list itself is never consulted).
# Indexing by position is kept because train2_data's container type is not
# visible here.
for i in range(len(train2_data)):
    tokens = set(train2_data[i][0][1].split(" "))
    if tokens & _desserts_set:
        label = "desserts"
    elif tokens & _soups_set:
        label = "soups_and_salads"
    elif tokens & _main_course_set:
        label = "main_course"
    else:
        label = "appetizers"
    train2_labels.append(label)

print(train2_labels) 

In [None]:
# Reduce every review to a space-joined string of the food words it contains.
train2 = []

for i in range(len(train2_data)):
    tokens = set(train2_data[i][0][1].split(" "))
    # Iterate the vocabulary directly instead of indexing it: `words` may be
    # a set, and words[j] raises TypeError on a set.
    word_list = [food_word for food_word in words if food_word in tokens]
    train2.append(" ".join(word_list))

In [None]:
# Keep the text of every review that mentions at least one food word.
train2_data_2 = []

for i in range(len(train2_data)):
    review_text = train2_data[i][0][1]
    tokens = set(review_text.split(" "))
    # isdisjoint accepts any iterable, so this works whether `words` is a set
    # or a list; the original words[j] indexing raised TypeError on a set.
    if not tokens.isdisjoint(words):
        train2_data_2.append(review_text)

### Data Representation 1: One Hot Encoding
Categorical data is converted into a numerical representation by creating a sparse (mostly zero) matrix with reviews as rows and
food words as columns. A matrix value is 1 if the review contains that food word, and 0 otherwise.

In [None]:
import numpy as np
from numpy import array

# One-hot matrix `a`: one row per review, one column per food word, in the
# order of list(words).
vocab = list(words)  # fixes a column order and works for a set or a list
values = array(vocab)

b = len(train2_data)
c = len(vocab)
a=np.zeros((b, c))

for i in range(b):
    # Whole-token match, the same rule the labelling cells use; a substring
    # test on the raw text would flag 'tea' inside 'steak' or 'egg' inside
    # 'veggie'.
    tokens = set(train2_data[i][0][1].split(" "))
    # enumerate covers every column; the original range(len(words)-1)
    # silently skipped the last word.
    for j, food_word in enumerate(vocab):
        if food_word in tokens:
            a[i][j]=1

print(a)

In [None]:
# convert output matrix 'a' into dataframe.

# Name the columns from `words` itself rather than from a second hand-typed
# copy of the vocabulary. The copy contained repeated entries, so it had more
# names than a de-duplicated `words` has columns in `a`, and it would drift
# whenever the vocabulary is edited.
df_train = pd.DataFrame(a, columns=list(words))

df_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the course-name strings to integer class ids.
le = LabelEncoder()
le.fit(train2_labels)
train2_label = le.transform(train2_labels)

## Learning Models
### SVM
#### Parameter Optimisation

In [None]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search
## Replace Logistic

#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over SVC settings on the one-hot food-word features,
# scored by accuracy.
svm = SVC()

param_grid = dict(
    C=[1, 6, 10],
    kernel=["rbf", "linear", "poly", "sigmoid"],
    degree=[2, 3, 5, 10],
    cache_size=[100, 200, 300],
)

model = grid_search.GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(df_train, train2_label)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### The best parameters given by Grid Search CV are  
"C=6, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False,tol=0.001, cache_size=100, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None"
#####  These parameters are considered to build the model.

In [None]:
from sklearn.svm import SVC
# Linear SVC with the settings reported by the grid search, fitted on the
# one-hot food-word features.
svm = SVC(
    C=6,
    kernel='linear',
    degree=2,
    gamma=1,
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=100,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)
svm = svm.fit(df_train, train2_label)

### Learning Curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...
 
# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        svm, df_train, train2_label, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("SVMClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.0, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

#### Inference
From the graph of learning curve, it can be inferred that it has Low Variance and Low Bias

### Data Representation 2: TfIdf

In [None]:
# Build the term-count matrix for the Task 2 food-word documents in `train2`.
# NOTE(review): `count_vect` is not defined in this cell; fit_transform refits
# it, so if it is the same vectorizer object used for Task 1 its learned
# vocabulary is presumably replaced here -- confirm that is intended.
train2_counts = count_vect.fit_transform(train2)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the Task 2 term counts into TF-IDF features (default settings).
# This rebinds the name `tfidf_transformer`.
tfidf_transformer = TfidfTransformer().fit(train2_counts)
train2_tfidf = tfidf_transformer.transform(train2_counts)

### Learning Model
### SVM
#### Parameter Optimisation

In [None]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import decomposition, grid_search


#Grid Search - Used to find the best combination of parameters
# 10-fold grid search over SVC settings on the Task 2 TF-IDF features,
# scored by accuracy. The string course labels are used as the target.
svm = SVC()

param_grid = dict(
    C=[1, 6, 10],
    kernel=["rbf", "linear", "poly", "sigmoid"],
    degree=[2, 3, 5, 10],
    cache_size=[100, 200, 300],
)

model = grid_search.GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    iid=True,
    n_jobs=4,
    verbose=2,
)
model.fit(train2_tfidf, train2_labels)

# Report the winning score and only the parameters that were searched
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### The best parameters given by Grid Search CV are  
"C=10, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=100, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None"
#####  These parameters are considered to build the model.

In [None]:
from sklearn.svm import SVC
# Linear SVC with the settings reported by the grid search, fitted on the
# Task 2 TF-IDF features with the string course labels.
svm = SVC(
    C=10,
    kernel='linear',
    degree=2,
    gamma=1,
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=100,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)
svm = svm.fit(train2_tfidf, train2_labels)

### Learning Curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.learning_curve import learning_curve
 
# assume classifier and training data is prepared...
 
# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
        svm, train2_tfidf, train2_labels, cv=10, n_jobs=-1, train_sizes=np.linspace(.1, 1., 10), verbose=0)

# Aggregate across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("SVMClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim((0.0, 1.01))
plt.gca().invert_yaxis()
plt.grid()

# Plot the average training and test score lines at each training set size
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Test score")

# Plot the std deviation as a transparent range at each training set size
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                 alpha=0.1, color="b")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                 alpha=0.1, color="r")

# Single legend call, made after the labelled lines exist. The earlier call
# placed before plotting had no handles to show and its loc was discarded.
plt.legend(loc="best")

# Draw the plot and reset the y-axis
plt.draw()
plt.show()
plt.gca().invert_yaxis()

### Inference
From the graph of learning curve, it can be inferred that it has Low Variance and Low Bias

##### A similar approach is carried out for other models such as AdaBoost, Logistic Regression, Random Forest. 

## Performance Evaluation
### Task 1
#### Technique 1: SVM

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier


# 5-fold cross-validation of a linear SVC on the two feature matrices:
# train_counts (reported as "Bag Of Words") and train_tfidf (reported as "TFIDF").
clf1 = SVC(C=1, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False,
          tol=0.001, cache_size=100, class_weight=None,
          verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)
scores1 = cross_val_score(clf1, train_counts, train_labels_num, cv=5)

# cross_val_score clones the estimator and refits it on every fold, so fitting
# clf2 on the full training set beforehand only cost time; the estimator is
# therefore passed in unfitted, exactly like clf1.
clf2 = SVC(C = 1, cache_size = 100, degree = 2, kernel = 'linear')
scores2 = cross_val_score(clf2, train_tfidf, train_labels_num, cv=5)

print(scores1)
print(scores2)

# Mean accuracy over the folds.
avg1 = scores1.mean()
avg2 = scores2.mean()

print("Average Accuracy of Bag Of Words representation :  ", avg1)
print("Average Accuracy of TFIDF representation :  ", avg2)

#### Technique 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation of a 200-tree, entropy-criterion random forest on
# train_counts ("Preprocessing 1") and on train_tfidf ("Preprocessing 2").
clf1 = RandomForestClassifier(n_estimators=200, criterion='entropy', min_samples_split=2)
scores1 = cross_val_score(clf1, train_counts, train_labels_num, cv=5)

clf2 = RandomForestClassifier(criterion='entropy', min_samples_split=2, n_estimators=200)
scores2 = cross_val_score(clf2, train_tfidf, train_labels_num, cv=5)

print(scores1)
print(scores2)

# Average the five fold scores of each run.
avg1 = sum(scores1[:5]) / 5
avg2 = sum(scores2[:5]) / 5

print("Average Accuracy of Preprocessing 1 :  ", avg1)
print("Average Accuracy of Preprocessing 2 :  ", avg2)

#### Technique 3: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation of the same logistic-regression configuration on
# train_counts ("Preprocessing 1") and on train_tfidf ("Preprocessing 2").
clf1 = LogisticRegression(C=1.5, intercept_scaling=0.5, solver='newton-cg')
scores1 = cross_val_score(clf1, train_counts, train_labels_num, cv=5)

clf2 = LogisticRegression(C=1.5, intercept_scaling=0.5, solver='newton-cg')
scores2 = cross_val_score(clf2, train_tfidf, train_labels_num, cv=5)

print(scores1)
print(scores2)

# Average the five fold scores of each run.
avg1 = sum(scores1[:5]) / 5
avg2 = sum(scores2[:5]) / 5

print("Average Accuracy of Preprocessing 1 :  ", avg1)
print("Average Accuracy of Preprocessing 2 :  ", avg2)

#### Technique 4: AdaBoost

In [None]:
# 5-fold cross-validation of AdaBoost (150 estimators) on train_counts
# ("Preprocessing 1") and on train_tfidf ("Preprocessing 2").
# NOTE(review): clf1 uses learning_rate=1 while clf2 uses learning_rate=0.8, so
# the two scores differ in more than the feature representation - confirm
# that this is intentional.
clf1 = AdaBoostClassifier(base_estimator=None, n_estimators=150, learning_rate=1, algorithm='SAMME.R',random_state=None)
scores1 = cross_val_score(clf1, train_counts, train_labels_num, cv=5)

# cross_val_score clones the estimator and refits it on every fold, so fitting
# clf2 on the full training set beforehand only cost time; the estimator is
# therefore passed in unfitted, exactly like clf1.
clf2 = AdaBoostClassifier(learning_rate = 0.8, n_estimators = 150)
scores2 = cross_val_score(clf2, train_tfidf, train_labels_num, cv=5)

print(scores1)
print(scores2)

# Mean accuracy over the folds.
avg1 = scores1.mean()
avg2 = scores2.mean()

print("Average Accuracy of Preprocessing 1 :  ", avg1)
print("Average Accuracy of Preprocessing 2 :  ", avg2)

###  Inference :
We can observe that the SVM technique performs better than the other three techniques, and that the bag-of-words representation performs better than TF-IDF. Hence the SVM model with the bag-of-words representation is used for Task 1.

## Task 2
### Cross Validation of svm


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# 5-fold cross-validation of a linear SVC on train_counts (reported as
# "one hot encoding") and on train_tfidf (reported as "TFIDF").
# NOTE(review): these are the same inputs (train_counts / train_tfidf /
# train_labels_num) as the Task 1 SVM cell, whereas the SVC fitted earlier in
# the notebook uses train2_tfidf / train2_labels - confirm that the Task 2
# evaluation is meant to run on these matrices.
clf1 = SVC(C=1, kernel='linear', degree=2, gamma=1, coef0=0.0, shrinking=True, probability=False,
          tol=0.001, cache_size=100, class_weight=None,
          verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)
scores1 = cross_val_score(clf1, train_counts, train_labels_num, cv=5)

clf2 = SVC(C = 1, cache_size = 100, degree = 2, kernel = 'linear')
scores2 = cross_val_score(clf2, train_tfidf, train_labels_num, cv=5)
# cross_val_score works on clones, so this fit has no influence on the scores
# above; it is kept as a separate step so that `clf2` is still a fitted model
# for any later cell that uses it.
clf2.fit(train_tfidf, train_labels_num)

print(scores1)
print(scores2)

# Mean accuracy over the folds.
avg1 = scores1.mean()
avg2 = scores2.mean()

print("Average Accuracy of one hot encoding :  ", avg1)
print("Average Accuracy of TFIDF :  ", avg2)

###  Inference :
We can observe that the SVM technique performs better than the other three techniques, and that the TF-IDF representation performs better than one-hot encoding. Hence the TF-IDF representation is used for Task 2.

## Get the top items

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer with default settings on train2. `Xtr` is the
# resulting document-term matrix and `features` lists the vocabulary terms in
# the same order as the matrix columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() - adjust if the notebook is run on a newer release.
vec_pipe = TfidfVectorizer()
Xtr = vec_pipe.fit_transform(train2)
features = vec_pipe.get_feature_names()

In [None]:
def top_tfidf_feats(row, features, top_n=5):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- 1-D array of tfidf scores, one entry per feature.
    features -- sequence of feature names, indexed like `row`.
    top_n    -- maximum number of (feature, score) pairs to return.

    Returns a DataFrame with the columns 'feature' and 'tfidf', ordered from the
    highest score to the lowest. An empty row (or top_n=0) yields an empty
    DataFrame that still carries both columns.
    '''
    # argsort is ascending, so reverse it before keeping the first top_n indices.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (instead of assigning df.columns
    # afterwards) also works when top_feats is empty; the assignment raised a
    # length-mismatch ValueError in that case.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [None]:
def top_feats_in_doc(Xtr, features, row_id, top_n=5):
    ''' Return the top_n highest-tfidf features of one document, i.e. of row `row_id` of Xtr. '''
    # Densify just the requested row, then drop its singleton dimension so the
    # scores form a flat array.
    dense_row = Xtr[row_id].toarray()
    doc_scores = np.squeeze(dense_row)
    return top_tfidf_feats(doc_scores, features, top_n)

In [None]:
# Show the 5 highest-tfidf features of the document stored at row index 2 of Xtr.
top_feats_in_doc(Xtr, features, 2, top_n=5)

### Separate positive and negative reviews

In [None]:
from sklearn.base import clone

# Split the reviews by label: a record whose second field is "pos" goes to the
# positive group, everything else to the negative group. dict_pos / dict_neg
# keep the original record under its position in train2_data.
pos_reviews = []
neg_reviews = []
dict_pos = {}
dict_neg = {}

for i, record in enumerate(train2_data):
    if record[1] == "pos":
        pos_reviews.append(train2[i])
        dict_pos[i] = record
    else:
        neg_reviews.append(train2[i])
        dict_neg[i] = record

# Group sizes, kept under the names the original counters used.
j = len(pos_reviews)
k = len(neg_reviews)

# Fit independent, unfitted copies of count_vect (same parameters). Refitting
# the shared count_vect twice threw away the positive vocabulary as soon as the
# negative reviews were fitted, and also overwrote whatever vocabulary
# count_vect had been fitted with before this cell.
pos_vect = clone(count_vect)
neg_vect = clone(count_vect)
pos_counts = pos_vect.fit_transform(pos_reviews)
neg_counts = neg_vect.fit_transform(neg_reviews)

### Top 5 items in positive category:

In [None]:
feature_names = vec_pipe.get_feature_names()

feature_array = np.array(feature_names)

# Score the positive reviews with vec_pipe itself so that the column indices
# match feature_array. The previous version ranked the columns of pos_counts,
# which come from a different vectorizer (count_vect) and therefore a different
# vocabulary, and its per-row argsort, once flattened and reversed, only
# reflected the ordering of the last review.
pos_tfidf = vec_pipe.transform(pos_reviews)
# Total tfidf weight of every term over all positive reviews, as a flat array.
pos_term_scores = np.asarray(pos_tfidf.sum(axis=0)).ravel()
tfidf_sorting = np.argsort(pos_term_scores)[::-1]

# Keep the n highest-scoring terms.
n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

In [None]:
def _mentioned_in(term, menu_entries):
    ''' Return True if `term` is contained in at least one entry of `menu_entries`. '''
    return any(term in entry for entry in menu_entries)


# Maximum number of terms collected per menu category.
max_per_category = 5

top_main_course = []
top_soups = []
top_appetizers = []
top_desserts = []

# Result list paired with the menu it is matched against, in the order the
# categories are checked: main course, soups, desserts, appetizers.
category_picks = [
    (top_main_course, main_course),
    (top_soups, soups),
    (top_desserts, desserts),
    (top_appetizers, appetizers),
]

# Walk the ranked terms once; each term is added at most once per category.
# The appetizers loop used to lack the `break` the other three had, so a term
# matching several appetizer entries was appended repeatedly and the limit of
# five could be exceeded.
for term in top_n:
    for picks, menu_entries in category_picks:
        if len(picks) < max_per_category and _mentioned_in(term, menu_entries):
            picks.append(term)

# Per-category counts, kept under the names the original counters used.
count_main = len(top_main_course)
count_soups = len(top_soups)
count_appetizers = len(top_appetizers)
count_desserts = len(top_desserts)

In [None]:
# Print the collected terms per category: main course, soups, desserts, appetizers.
for _picks in (top_main_course, top_soups, top_desserts, top_appetizers):
    print(_picks)