In [1]:
import pandas as pd
import numpy as np
from load_data import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



In [40]:
def preprocess(df):
    arr_steps = df['steps'].copy().to_numpy()
    arr_ingr = df['ingredients'].copy().to_numpy()
    for i in range(len(arr_steps)):
        arr_steps[i] = str(arr_steps[i]).replace("[", "").replace("]", "").replace(",", "").replace("'", "")
        arr_ingr[i] = str(arr_ingr[i]).replace("[", "").replace("]", "").replace(",", "").replace("'", "")
    
    X = arr_steps + arr_ingr
    return X

In [3]:
arr_steps = df_train['steps'].copy().to_numpy()
arr_ingr = df_train['ingredients'].copy().to_numpy()
for i in range(len(arr_steps)):
    arr_steps[i] = str(arr_steps[i]).replace("[", "").replace("]", "").replace(",", "").replace("'", "")
    arr_ingr[i] = str(arr_ingr[i]).replace("[", "").replace("]", "").replace(",", "").replace("'", "")
steps_srs = pd.Series(arr_steps)

steps = steps_srs.str.cat(sep=' ')
tokens = word_tokenize(steps)
stop_words = set(stopwords.words('english'))
tokens = list(set(tokens))  # remove duplicates
tokens = [w for w in tokens if not w in stop_words] # remove stop words
# stemming
ps = PorterStemmer()
for i in range(len(tokens)):
    tokens[i] = ps.stem(tokens[i])

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate


X = preprocess(df_train)
y = df_train['duration_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [49]:
from sklearn.linear_model import LogisticRegression

penalties = ['l1', 'l2', 'elasticnet', 'none']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
lg = LogisticRegression(random_state=0, max_iter=1000, solver='liblinear', penalty='l1', C=1.0)
lg.fit(X_train, y_train)
print(lg.score(X_test, y_test))

0.7955


In [50]:
X_train = preprocess(df_train)
X_test = preprocess(df_test)
y_train = df_train['duration_label']
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
lg = LogisticRegression(random_state=0, max_iter=1000, solver='liblinear', penalty='l1', C=1.0)
lg.fit(X_train, y_train)
predicts = lg.predict(X_test)

In [51]:
ids = np.array(range(len(predicts))) + 1
output = pd.DataFrame({'id': ids, 'duration_label': predicts})
output.to_csv('output.csv', index=False)

In [48]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear', "sag", "saga"]
penalty = ['l2', 'l1', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.791344 using {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.762906 (0.005463) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.761156 (0.001928) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.770250 (0.003837) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.763437 (0.005097) with: {'C': 100, 'penalty': 'l2', 'solver': 'sag'}
0.764406 (0.005287) with: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
0.747844 (0.005790) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'sag'}
0.760531 (0.005051) with: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
0.000000 (0.000000) wi

In [121]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=0, max_leaf_nodes=100)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.767375

In [118]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 200, random_state = 42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.779