In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def train_model(train_data):
  """Fit a TF-IDF vectorizer and a logistic-regression classifier on premise/hypothesis pairs.

  Args:
    train_data: Table exposing 'premise', 'hypothesis' and 'label' columns
      (the notebook passes a DataFrame loaded with ``pd.read_csv``).

  Returns:
    A ``(model, vectorizer)`` tuple: the fitted LogisticRegression and the
    fitted TfidfVectorizer, which must be reused to featurize evaluation data.
  """
  # Coerce each text column to unicode once and reuse the result for both
  # fitting the vocabulary and building the features.
  premises = train_data['premise'].values.astype('U')
  hypotheses = train_data['hypothesis'].values.astype('U')

  # One shared vocabulary is learned from each premise joined with its hypothesis.
  vectorizer = TfidfVectorizer()
  vectorizer.fit([f"{premise} {hypothesis}" for premise, hypothesis in zip(premises, hypotheses)])

  # Premise and hypothesis are vectorized separately and placed side by side,
  # so the feature matrix is twice as wide as a single transform's output.
  train_features = scipy.sparse.hstack((
      vectorizer.transform(premises),
      vectorizer.transform(hypotheses),
  ))
  train_labels = train_data['label']

  # `multi_class` is intentionally not passed: it is deprecated since
  # scikit-learn 1.5 and 'auto' was already the default behaviour.
  model = LogisticRegression(random_state=0, max_iter=100000, solver='lbfgs')
  model.fit(train_features, train_labels)

  print('Trained Model')
  return model, vectorizer


In [3]:
def test_model(model, vectorizer, test_data):
  test_corpus = [f"{premise} {hypothesis}" for premise, hypothesis in zip(test_data['premise'], test_data['hypothesis'])]

  tfidf_premise = vectorizer.transform(test_data['premise'].values.astype('U'))
  tfidf_hypothesis = vectorizer.transform(test_data['hypothesis'].values.astype('U'))

  test_features = scipy.sparse.hstack((tfidf_premise, tfidf_hypothesis))
  test_labels = test_data['label']

  pred_labels = model.predict(test_features)
  print(pred_labels)

  score = model.score(test_features, test_labels) * 100
  print("The classification accuracy for Logistic regression with TF-IDF features is {:.2f}%.".format(score))



In [4]:
# Read the train and dev CSV files (train first, then dev).
train_data, dev_data = map(pd.read_csv, ("./data/train.csv", "./data/dev.csv"))

# Fit on the training file, then report accuracy on the dev file using the
# same fitted vectorizer.
model, vectorizer = train_model(train_data)
test_model(model, vectorizer, dev_data)



Trained Model
[1 0 1 ... 1 0 1]
The classification accuracy for Logistic regression with TF-IDF features is 65.41%.
