# Group 19 - Natural Language Inference (A) - Traditional ML Approach (A)

## Dependency Management

In [1]:
!pip install xgboost



In [2]:
import os

import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from joblib import dump, load

## Define TF-IDF Embedding Methods.

In [3]:
def build_train_data(train_data):
  """Fit a TF-IDF vectorizer on the training pairs and encode them.

  The vectorizer is fitted on one document per row, made of the premise and
  the hypothesis joined by a single space. Premises and hypotheses are then
  transformed separately and their matrices are stacked side by side.

  Args:
    train_data: table-like object exposing 'premise', 'hypothesis' and
      'label' columns.

  Returns:
    A tuple (train_features, train_labels, vectorizer): the horizontally
    stacked sparse feature matrix, the 'label' column, and the fitted
    vectorizer to reuse on other splits.
  """
  premises = train_data['premise']
  hypotheses = train_data['hypothesis']

  # One document per pair, so the vocabulary covers both sentences.
  paired_docs = []
  for prem, hyp in zip(premises, hypotheses):
    paired_docs.append(f"{prem} {hyp}")

  vectorizer = TfidfVectorizer().fit(paired_docs)

  # Encode each column on its own; cast to unicode so every cell is a string.
  encoded_columns = [
    vectorizer.transform(column.values.astype('U'))
    for column in (premises, hypotheses)
  ]

  return scipy.sparse.hstack(encoded_columns), train_data['label'], vectorizer

In [4]:
def build_test_data(test_data, vectorizer):
  """Encode an evaluation split with an already-fitted vectorizer.

  Premises and hypotheses are transformed separately and stacked side by
  side, mirroring the layout produced for the training split.

  Args:
    test_data: table-like object exposing 'premise' and 'hypothesis'
      columns, and optionally a 'label' column.
    vectorizer: fitted object exposing `transform(documents)`; it is only
      used to transform, never re-fitted here.

  Returns:
    A tuple (test_features, test_labels). `test_labels` is the 'label'
    column when present and None for an unlabelled split.
  """
  # Cast to unicode so every cell reaches the vectorizer as a string.
  tfidf_premise = vectorizer.transform(test_data['premise'].values.astype('U'))
  tfidf_hypothesis = vectorizer.transform(test_data['hypothesis'].values.astype('U'))

  test_features = scipy.sparse.hstack((tfidf_premise, tfidf_hypothesis))

  # A split without gold labels can still be encoded for prediction.
  test_labels = test_data['label'] if 'label' in test_data else None

  return test_features, test_labels

## Define Models - Finding Best Hyper-parameters

In [5]:
def build_logistic_regression_model(train_features, train_labels, random_state=42):
  """Grid-search a logistic regression classifier and return the best one.

  Runs a 5-fold cross-validated grid search over the regularisation
  strength `C` and the solver, scored by accuracy, and prints the winning
  parameter combination.

  Args:
    train_features: feature matrix passed to `GridSearchCV.fit`.
    train_labels: target labels passed to `GridSearchCV.fit`.
    random_state: seed forwarded to `LogisticRegression` so runs that use a
      stochastic solver are repeatable. Pass None for unseeded behaviour.

  Returns:
    The `best_estimator_` of the fitted grid search.
  """
  param_grid = {
    # Log-spaced sweep. The grid previously listed 0.01 twice, which repeated
    # identical fits; 0.001 is presumably the value that was intended.
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'newton-cg', 'saga']
  }

  # `multi_class` is left at its default: 'auto' is already the default and
  # passing the argument explicitly is deprecated in recent scikit-learn.
  base_model = LogisticRegression(max_iter=100000, random_state=random_state)

  grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=4)

  grid_search.fit(train_features, train_labels)

  best_model = grid_search.best_estimator_
  print(grid_search.best_params_)

  return best_model

In [6]:
def build_random_forest_model(train_features, train_labels, random_state=42):
  """Grid-search a random forest classifier and return the best one.

  Runs a 5-fold cross-validated grid search, scored by accuracy, over tree
  count, depth, split/leaf sizes and the per-split feature budget, and
  prints the winning parameter combination.

  Args:
    train_features: feature matrix passed to `GridSearchCV.fit`.
    train_labels: target labels passed to `GridSearchCV.fit`.
    random_state: seed forwarded to `RandomForestClassifier` so the search
      and the reported accuracy can be reproduced. Pass None for unseeded
      behaviour.

  Returns:
    The `best_estimator_` of the fitted grid search.
  """
  param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
  }

  # Seed the forest: without it every run draws different trees, so neither
  # the selected parameters nor the final score are repeatable.
  base_model = RandomForestClassifier(n_jobs=-1, random_state=random_state)

  grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

  grid_search.fit(train_features, train_labels)

  best_model = grid_search.best_estimator_
  print(grid_search.best_params_)

  return best_model

In [7]:
def build_gradient_boosting_classifier(train_features, train_labels):
  """Grid-search an XGBoost classifier and return the best one.

  Runs a 5-fold cross-validated grid search, scored by accuracy, over the
  learning rate, number of boosting rounds, tree depth and row subsampling
  ratio, and prints the winning parameter combination.

  Args:
    train_features: feature matrix passed to `GridSearchCV.fit`.
    train_labels: target labels passed to `GridSearchCV.fit`.

  Returns:
    The `best_estimator_` of the fitted grid search.
  """
  search_space = dict(
    learning_rate=[0.1, 0.01],
    n_estimators=[100, 200, 300],
    max_depth=[5, 6, 7],
    subsample=[0.6, 0.8, 1.0],
  )

  tuner = GridSearchCV(
    XGBClassifier(),
    search_space,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
  )
  tuner.fit(train_features, train_labels)

  print(tuner.best_params_)
  return tuner.best_estimator_

## Testing

In [8]:
def test_model(model, test_features, test_labels, writeFile=False):
  if (writeFile):
    pred_labels = model.predict(test_features)
    write_to_csv(pred_labels)

  score = model.score(test_features, test_labels) * 100
  print("The classification accuracy for the model is {:.2f}%.".format(score))

In [9]:
def write_to_csv(pred_labels, output_path='Group_19_A.csv'):
  """Write predictions to a single-column CSV file.

  Args:
    pred_labels: one-dimensional sequence of predicted labels.
    output_path: destination file; defaults to the original file name,
      'Group_19_A.csv' in the working directory.

  The file has a 'prediction' header followed by one label per row; the
  DataFrame index is not written.
  """
  predictions_df = pd.DataFrame(pred_labels, columns=['prediction'])
  predictions_df.to_csv(output_path, index=False)

## Get data and build TF-IDF embeddings.

In [10]:
# Load the training and development splits from the local ./data directory.
train_data = pd.read_csv("./data/train.csv")
dev_data = pd.read_csv("./data/dev.csv")

# The vectorizer is fitted on the training split only and then reused to
# encode the held-out split, so both feature matrices share one vocabulary.
train_features, train_labels, vectorizer= build_train_data(train_data)
test_features, test_labels = build_test_data(dev_data, vectorizer) # Dev data for now

## Train individual models.

In [11]:
# Create the output directory before the grid search runs, so a missing
# folder cannot make the final `dump` fail after all the fitting is done.
os.makedirs('models', exist_ok=True)

log_reg = build_logistic_regression_model(train_features, train_labels)
test_model(log_reg, test_features, test_labels)
dump(log_reg, 'models/logistic_regression_model.joblib')

Fitting 5 folds for each of 18 candidates, totalling 90 fits
{'C': 0.1, 'solver': 'lbfgs'}
The classification accuracy for the model is 66.10%.


['models/logistic_regression_model.joblib']

In [12]:
# Create the output directory before the grid search runs, so a missing
# folder cannot make the final `dump` fail after all the fitting is done.
os.makedirs('models', exist_ok=True)

random_forest = build_random_forest_model(train_features, train_labels)
test_model(random_forest, test_features, test_labels)
dump(random_forest, 'models/random_forest_model.joblib')

Fitting 5 folds for each of 162 candidates, totalling 810 fits
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
The classification accuracy for the model is 66.29%.


['models/random_forest_model.joblib']

In [13]:
# Create the output directory before the grid search runs, so a missing
# folder cannot make the final `dump` fail after all the fitting is done.
os.makedirs('models', exist_ok=True)

gradient_boosting_model = build_gradient_boosting_classifier(train_features, train_labels)
test_model(gradient_boosting_model, test_features, test_labels)
dump(gradient_boosting_model, 'models/gradient_boosting_model.joblib')

Fitting 5 folds for each of 54 candidates, totalling 270 fits
{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
The classification accuracy for the model is 68.38%.


['models/gradient_boosting_model.joblib']

## Build ensemble model.

In [14]:
# Reload the three tuned models from the files written by the training cells,
# so the ensemble section does not depend on the grid searches having run in
# this kernel session.
# NOTE(review): joblib files are pickle-based; only load files produced by
# this notebook or another trusted source.
log_reg = load('models/logistic_regression_model.joblib')
random_forest = load('models/random_forest_model.joblib')
gradient_boosting_model = load('models/gradient_boosting_model.joblib')

In [15]:
# Combine the three tuned classifiers into a hard-voting ensemble.
# NOTE(review): VotingClassifier.fit is documented to fit clones of the
# supplied estimators, so the loaded models presumably contribute only their
# hyper-parameters here, not their fitted state — confirm against the
# installed scikit-learn version.
ensemble_model = VotingClassifier(estimators=[
    ('Logistic Regression', log_reg),
    ('Random Forest', random_forest),
    ('Gradient Boosting', gradient_boosting_model)
], voting='hard')

# Fit the ensemble on the full training features.
ensemble_model.fit(train_features, train_labels)

# Report accuracy on the same held-out split used for the individual models.
test_model(ensemble_model, test_features, test_labels)

The classification accuracy for the model is 67.09%.
