# Logistic Regression

In [None]:
import pandas as pd
# Load the raw reviews, one review per line. The context managers make sure
# both file handles are closed as soon as the lines have been read.
with open("../datasets/poor_amazon_toy_reviews.txt") as poor_file:
    poor = poor_file.readlines()
with open("../datasets/good_amazon_toy_reviews.txt") as good_file:
    good = good_file.readlines()

# Attach the label to every review: 1 for the "good" file, 0 for the "poor" one.
good_reviews = [(review, 1) for review in good]
poor_reviews = [(review, 0) for review in poor]

all_reviews = good_reviews + poor_reviews
all_reviews_df = pd.DataFrame(all_reviews, columns=["review", "positive"])
# Work on a random subsample of at most 10,000 reviews. Capping at the frame
# length keeps `sample` from raising when fewer reviews are available.
all_reviews_df = all_reviews_df.sample(min(10000, len(all_reviews_df)))
all_reviews_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; it is fitted on the review
# text in a later cell.
vectorizer = CountVectorizer()

In [None]:
# Labels (0/1) pulled out of the frame as a plain NumPy array.
y = all_reviews_df["positive"].to_numpy()
# Learn the vocabulary and build the document-term count matrix in one step.
X = vectorizer.fit_transform(all_reviews_df["review"])
# Leave the matrix as the cell's last expression so its summary is displayed.
X

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression accepts scipy sparse matrices directly, so fit on the
# sparse count matrix instead of densifying it. This avoids allocating a full
# (n_reviews x vocabulary) dense array and matches the sparse input that is
# later passed to `lr.predict(X)`.
lr = LogisticRegression()
lr.fit(X, y)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict on the same data the model was fitted on, so the figures below
# describe training performance only.
y_pred = lr.predict(X)

# Accuracy = fraction of predictions that equal the true label.
print(f"Training accuracy: {(y_pred == y).mean()}")

confusion_matrix(y_true=y, y_pred=y_pred)

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB does not accept scipy sparse input, hence the explicit dense
# conversion of the count matrix before fitting.
# NOTE(review): for word-count features MultinomialNB is the more usual
# choice and works on sparse input -- consider whether GaussianNB is intended.
nb = GaussianNB()
nb.fit(X.toarray(), y)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# The Naive Bayes model was fitted on a dense array, so predict on one too.
# These are predictions on the training data itself.
y_pred = nb.predict(X.toarray())

# Share of training reviews whose predicted label matches the true one.
print(f"Training accuracy: {(y_pred == y).mean()}")

confusion_matrix(y_true=y, y_pred=y_pred)

## Using Train/Test Splits (Method 1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation. The sparse count matrix is split
# as-is: train_test_split can index scipy sparse matrices directly, so
# converting to a dense array first only costs memory.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Logistic Regression Performance

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a fresh model on the training split only, then score it on the held-out
# test split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# calculate accuracy on the test split (the label previously said "Training",
# which misreported what was being measured)
print(f"Test accuracy: {np.mean(y_pred == y_test)}")

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

## AUROC (Area Under the Receiver Operating Characteristic Curve)

In [None]:
from sklearn.metrics import roc_auc_score
# AUROC measures how well a continuous score ranks positives above negatives,
# so pass the predicted probability of the positive class (column 1 of
# predict_proba) rather than the thresholded 0/1 labels in y_pred, which
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])

In [None]:
# Dense DataFrame with one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement and returns the same terms.
data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Append the labels as an extra column so features and target travel together.
data["TARGET"] = y

## Using Train/Test Splits (Method 2)

In [None]:
from sklearn.model_selection import train_test_split

# Split the combined features+label frame row-wise, using train_test_split's
# default proportions.
train_df, test_df = train_test_split(data)

# Features: every column except the label.
X_train = train_df.drop(columns=["TARGET"])
X_test = test_df.drop(columns=["TARGET"])

# Labels: the TARGET column on its own.
y_train, y_test = train_df["TARGET"], test_df["TARGET"]

In [None]:
# Sanity check: one shape per line, in the order X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Refit the existing `lr` estimator on the Method 2 training split; this
# replaces whatever it learned from the earlier fit.
lr.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows from the Method 2 split.
y_pred = lr.predict(X_test)

# Test accuracy: fraction of held-out rows whose prediction matches TARGET.
(y_pred == y_test).mean()

## Cross Validation

In [None]:
from sklearn.model_selection import cross_validate
# NOTE: this rebinds X from the vectorizer's output to a DataFrame holding
# every column of `data` except the label.
X = data.drop(columns=["TARGET"])
# 10-fold cross-validation of `lr`; only the test-fold scores are kept.
cv_results = cross_validate(
    lr,
    X,
    y,
    cv=10,
    return_train_score=False,
)

In [None]:
# Per-fold test scores from the cross-validation run (one entry per fold).
cv_results['test_score']