In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import os
import pandas as pd
import numpy as np
import re
import time
import pickle

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets, svm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, SelectFpr, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from tqdm import tqdm,trange
from collections import defaultdict,Counter
from text_to_num import text2num,alpha2digit
from mlxtend.classifier import StackingCVClassifier

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
def preprocess(file, 
               process = None, 
               convert_to_seconds = False):
    curr_dir = os.path.dirname(os.path.realpath('__file__'))
    if not process:
        train_file = os.path.join(curr_dir, 'COMP30027_2021_Project2_datasets\\'+file)
    else:
        train_file = os.path.join(curr_dir, 'COMP30027_2021_Project2_datasets\\recipe_text_features_' + process + '\\' + file)

    data = pd.read_csv(train_file, index_col = False, delimiter = ',')
    if convert_to_seconds:
        tqdm.pandas(desc="Converting...")
        data['seconds'] = data['steps'].progress_apply(convert_step_to_time)
    return data

In [3]:
time_retrieval = lambda words, sentence : re.findall(re.compile('|'.join(
        [n + w + "\\b" for w in words for n in ["[0-9]+\s*","another ","few ","a ","an ","several "]])), sentence)

kw_retrieval = lambda words, sentence : re.findall(re.compile('|'.join(
        ["\\b" + w + "\\b" for w in words])), sentence)

def convert_step_to_time(step, keyword_check = True):
    total_time = 0 # in seconds
    times = defaultdict(list)
    time_unit = {"second":1, "minute":60, "hour":60*60}
    kw_dict = {"overnight":8, "night":8, "nights":8, "freeze":5, "refrigerate":3,
               "day":12, "cook on low":7, "slow cook":7, "crockpot":7, "crock pot":7,
               "cook low":7, "boil":1, "heat":1, "bread machine":1.5
               }
    
    # Iterate through each step to find time value using RE
    numeric_step = alpha2digit(step,'en')
    times["second"] = time_retrieval(["more seconds","more second","more secs","more sec","seconds","second","secs","sec","s"],numeric_step)
    times["minute"] = time_retrieval(["more minutes","more minute","more min","more mins","minutes","minute","min","mins","m","ms"],numeric_step)
    times["hour"] = time_retrieval(["more hours","more hour","more hrs","more hr","hours","hour","hrs","hr","h","hs"],numeric_step)
    special_keywords = kw_retrieval(kw_dict.keys(),numeric_step) if keyword_check else None

    for unit in time_unit.keys():
        #total_time += sum([float(re.findall(r'[0-9]+',t.split()[0])[0]) * time_unit[unit] for t in times[unit]])
        count = 0
        for t in times[unit]:
            numerator = t.split()[0]

            if numerator in ["another","a","an"]:
                count = 1
            elif numerator in ["few","several"]:
                count = 4
            else:
                count = float(re.findall(r'[0-9]+',numerator)[0])

            total_time += count * time_unit[unit]

    if keyword_check and not total_time and special_keywords:
        total_time += 60*60*kw_dict[special_keywords[0]]

    if total_time == 0:
        return None
    return total_time

In [4]:
time_retrieval = lambda words, sentence : re.findall(re.compile('|'.join(
        [n + w + "\\b" for w in words for n in ["[0-9]+\s*","another ","few ","a ","an ","several "]])), sentence)

kw_retrieval = lambda words, sentence : re.findall(re.compile('|'.join(
        ["\\b" + w + "\\b" for w in words])), sentence)

def convert_step_to_time(step, keyword_check = True):
    total_time = 0 # in seconds
    times = defaultdict(list)
    time_unit = {"second":1, "minute":60, "hour":60*60}
    kw_dict = {"overnight":8, "night":8, "nights":8, "freeze":5, "refrigerate":3,
               "day":12, "cook on low":7, "slow cook":7, "crockpot":7, "crock pot":7,
               "cook low":7, "boil":1, "heat":1, "bread machine":1.5
               }
    
    # Iterate through each step to find time value using RE
    numeric_step = alpha2digit(step,'en')
    times["second"] = time_retrieval(["more seconds","more second","more secs","more sec","seconds","second","secs","sec","s"],numeric_step)
    times["minute"] = time_retrieval(["more minutes","more minute","more min","more mins","minutes","minute","min","mins","m","ms"],numeric_step)
    times["hour"] = time_retrieval(["more hours","more hour","more hrs","more hr","hours","hour","hrs","hr","h","hs"],numeric_step)
    special_keywords = kw_retrieval(kw_dict.keys(),numeric_step) if keyword_check else None

    for unit in time_unit.keys():
        #total_time += sum([float(re.findall(r'[0-9]+',t.split()[0])[0]) * time_unit[unit] for t in times[unit]])
        count = 0
        for t in times[unit]:
            numerator = t.split()[0]

            if numerator in ["another","a","an"]:
                count = 1
            elif numerator in ["few","several"]:
                count = 4
            else:
                count = float(re.findall(r'[0-9]+',numerator)[0])

            total_time += count * time_unit[unit]

    if keyword_check and not total_time and special_keywords:
        total_time += 60*60*kw_dict[special_keywords[0]]

    if total_time == 0:
        return None
    return total_time

In [5]:
train_data = pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_train.csv')
train_data_50 = pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\train_steps_doc2vec50.csv',header=None, names = ['coord'+str(i) for i in range(50)])

In [6]:
train = pd.concat([train_data,train_data_50],axis=1)
y = train['duration_label']
x = train.drop(['name','steps','ingredients','duration_label'], axis=1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [5]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1, verbose=True)
c2 = GaussianNB()
c3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1, verbose=False)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1, verbose=False)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True, verbose=False)

#sclf.fit(X_train,y_train)
print('Process takes ',time.time()-start, 'seconds')

Process takes  0.0 seconds


In [31]:
preds = pd.Series(sclf.predict(X_test))
classification_report(y_test, preds)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


'              precision    recall  f1-score   support\n\n         1.0       0.72      0.71      0.71      2646\n         2.0       0.72      0.77      0.74      3053\n         3.0       0.49      0.20      0.28       301\n\n    accuracy                           0.71      6000\n   macro avg       0.64      0.56      0.58      6000\nweighted avg       0.71      0.71      0.71      6000\n'

In [32]:
print(f'Accuracy: {100*accuracy_score(preds, y_test):.2f}%')

Accuracy: 71.32%


In [33]:
cross_val_score(sclf,x,y).mean()

Fitting 3 classifiers...
Fitting classifier1: randomforestclassifier (1/3)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Par

0.71325

In [35]:
cross_val_score(AdaBoostClassifier(),x,y).mean()

0.6760249999999999

In [36]:
cross_val_score(BaggingClassifier(c2),x,y).mean()

0.63335

In [11]:
train_data = pd.read_csv('COMP30027_2021_Project2_datasets\\input\\train_seconds.csv')
train_data_100 = pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\train_steps_doc2vec100.csv',header=None, names = ['coord'+str(i) for i in range(100)])
train = pd.concat([train_data,train_data_100],axis=1)
y100 = train['duration_label']
x100 = train.drop(['name','steps','ingredients','duration_label'], axis=1)

In [12]:
train = train.dropna()
y100 = train['duration_label']
x100 = train.drop(['name','steps','ingredients','duration_label'], axis=1)

In [12]:
start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]:

In [13]:
start = time.time()
print(cross_val_score(SVC(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7294058455715811

In [15]:
start = time.time()
print(cross_val_score(GaussianNB(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.5699892538439091
Process takes  0.5106613636016846 seconds


In [16]:
start = time.time()
print(cross_val_score(DecisionTreeClassifier(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.6779986540110515
Process takes  29.413482666015625 seconds


In [17]:
start = time.time()
print(cross_val_score(LogisticRegression(max_iter=10000),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [18]:
start = time.time()
print(cross_val_score(RandomForestClassifier(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7441915533184241
Process takes  138.90527200698853 seconds


In [19]:
start = time.time()
print(cross_val_score(AdaBoostClassifier(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7364848804798234
Process takes  91.97082996368408 seconds


In [21]:
start = time.time()
print(cross_val_score(KNeighborsClassifier(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7552663885195186
Process takes  12.221656560897827 seconds


In [20]:
start = time.time()
print(cross_val_score(BaggingClassifier(DecisionTreeClassifier()),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7482447571595648
Process takes  179.22210359573364 seconds


In [22]:
start = time.time()
print(cross_val_score(BaggingClassifier(KNeighborsClassifier()),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7566651080659482
Process takes  114.75170159339905 seconds


In [23]:
start = time.time()
print(cross_val_score(BaggingClassifier(SVC()),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7290062410979176
Process takes  3350.5638234615326 seconds


In [13]:
start = time.time()
print(cross_val_score(GradientBoostingClassifier(),x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7785866594561183
Process takes  1469.8330869674683 seconds


In [53]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
start = time.time()
param_grid = {'n_neighbors': np.arange(38,1000,20)}
rcv = GridSearchCV(KNeighborsClassifier(),param_grid, cv=5)
rcv.fit(x100,y100)
print('Process takes ',time.time()-start, 'seconds')
print(rcv.best_params_)
rcv.best_score_

Process takes  1015.8914716243744 seconds
{'n_neighbors': 38}


0.7703660191201482

In [47]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = GaussianNB()
c3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100, cv = 10).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7675407625166687
Process takes  425.40338587760925 seconds


In [37]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
c3 = KNeighborsClassifier(n_neighbors=38)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7761890203919485
Process takes  291.49292397499084 seconds


In [43]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)
c3 = KNeighborsClassifier(n_neighbors=38)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7817265968809544
Process takes  246.60619163513184 seconds


In [28]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
c3 = KNeighborsClassifier(n_neighbors=21)
lr = KNeighborsClassifier(n_neighbors=21)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7719644451629282
Process takes  267.93580436706543 seconds


In [29]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
c3 = KNeighborsClassifier(n_neighbors=21)
lr = BaggingClassifier(DecisionTreeClassifier())

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7517270543982342
Process takes  276.2685396671295 seconds


In [30]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
lr = KNeighborsClassifier(n_neighbors=21)
c3 = BaggingClassifier(DecisionTreeClassifier())

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7691672342175991
Process takes  571.7859828472137 seconds


In [35]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
c3 = KNeighborsClassifier(n_neighbors=38)
c4 = KNeighborsClassifier(n_neighbors=38)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3,c4],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7753612074479902
Process takes  333.37484431266785 seconds


In [36]:
start = time.time()
c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = AdaBoostClassifier()
c3 = KNeighborsClassifier(n_neighbors=38)
c4 = DecisionTreeClassifier()
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3,c4],
                            meta_classifier=lr, use_probas=True)

start = time.time()
print(cross_val_score(sclf,x100,y100).mean())
print('Process takes ',time.time()-start, 'seconds')

0.7763602899282521
Process takes  352.87842202186584 seconds


In [46]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

test_data = preprocess('recipe_test.csv', convert_to_seconds=True)#pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_train.csv')
test_data_100 = pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\test_steps_doc2vec100.csv',header=None, names = ['coord'+str(i) for i in range(100)])
test = pd.concat([test_data,test_data_100],axis=1)
x100test = test.drop(['name','steps','ingredients'], axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x100)

x100test = imp.transform(x100test)

c1 = RandomForestClassifier(n_estimators=100,min_samples_split=50,min_samples_leaf=1, criterion='gini', n_jobs=-1)
c2 = GaussianNB()#KNeighborsClassifier(n_neighbors=38)
c3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

sclf = StackingCVClassifier(classifiers=[c1,c2,c3],
                            meta_classifier=lr, use_probas=True)

sclf.fit(x100,y100)
Y_test = sclf.predict(x100test)

out = pd.DataFrame({'duration_label':Y_test})
out.index += 1
out.to_csv('output/out_'+'randomforest_gaussiannb_logisticreg'+'.csv',index_label = 'id')

Converting...: 100%|██████████| 10000/10000 [00:11<00:00, 834.87it/s]


In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

test_data = pd.read_csv('COMP30027_2021_Project2_datasets\\input\\test_seconds.csv')
test_data_100 = pd.read_csv('COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\test_steps_doc2vec100.csv',header=None, names = ['coord'+str(i) for i in range(100)])
test = pd.concat([test_data,test_data_100],axis=1)
x100test = test.drop(['name','steps','ingredients'], axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x100)

x100test = imp.transform(x100test)

model = GradientBoostingClassifier()

model.fit(x100,y100)
Y_test = model.predict(x100test)

out = pd.DataFrame({'duration_label':Y_test})
out.index += 1
out.to_csv('output/out_'+'GradientBoosting'+'.csv',index_label = 'id')