In [364]:
# Please note: you are only allowed to use the following packages:
# pandas, numpy, scikit-learn(sklearn), nltk
# You may not use other external data sources

import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [365]:
train = pd.read_csv("https://raw.githubusercontent.com/aiwei/inst414-21s/main/kaggle/train.csv")

In [366]:
train.head(5)

Unnamed: 0,label,text
0,1,"Henry Thomas showed a restraint, even when the..."
1,1,"This movie starts out brisk, has some slow mom..."
2,1,Castle of Blood is a good example of the quali...
3,1,I viewed the movie together with a homophobic ...
4,1,"The ""Men in White"" movie is definitely one of ..."


In [367]:
test = pd.read_csv("https://raw.githubusercontent.com/aiwei/inst414-21s/main/kaggle/test.csv")

In [368]:
test.head(5)

Unnamed: 0,Id,text
0,0,I cannot believe I actually sat through the wh...
1,1,I saw this one remastered on DVD. It had a big...
2,2,"Irrespective of the accuracy of facts, Bandit ..."
3,3,"Significant Spoilers! This is a sick, disturbi..."
4,4,If there are people that don't like this movie...


There are several ways to start with. First, you should think of a way to transform the text of a review to a feature vector, such that each dimension represents a word and the value represents the weight of that word in the review. You can also try different TF-IDF tricks to adjust the weightings. You may also consider adding bi-gram features as well. the `sklearn` package offers some ways to extract features from text, so let's play with one of them.

### Data Preprocessing

In [369]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()
def preprocessor(review):
    words = nltk.word_tokenize(review)
    # Stemming
    stems = [stemmer.stem(word) for word in words]
    processed_reviews = ' '.join(stems)
    return processed_reviews

In [370]:
cont_vect = CountVectorizer(lowercase=True, stop_words='english', min_df=10, preprocessor=preprocessor,
                            ngram_range=(1, 2))

In [371]:
X_train = cont_vect.fit_transform(train.text)



In [372]:
X_train

<10000x16845 sparse matrix of type '<class 'numpy.int64'>'
	with 1071839 stored elements in Compressed Sparse Row format>

As the name suggest, we are transforming the reviews in the training set as a 10000 x 51704 matrix. THe number 51704 indicates that there are 51704 unique word in the training reviews. We can also limit the number of features in the matrix by setting the `max_features` when initiating the CountVectorizer.

In [373]:
X_test = cont_vect.transform(test.text)

In [374]:
y_train = train.label

You may notice that we are no longer constructing X_train, X_test, y_train, y_test using train_test_split. Obviously, the train-test split is now provided by a third-party and the y_test is hidden from you.

## Building Logistic Regression Classifier

In [375]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
model = LogisticRegression(penalty='l2', C=0.000091, solver='saga', l1_ratio=0.1, warm_start=True)

### Scaling Data

In [376]:
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [377]:
model.fit(X_train_scaled, y_train)



In [378]:
model_prediction = model.predict(X_test_scaled)

In [379]:
model_prediction_df = pd.DataFrame({"Id": test.Id, "Category": model_prediction})

In [380]:
model_prediction_df.head(10)

Unnamed: 0,Id,Category
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
5,5,1
6,6,1
7,7,0
8,8,1
9,9,1


In [384]:
model_prediction_df.to_csv("LR_model_prediction.csv", index=False)

### Accuracy Testing

In [382]:
# Splitting 'train' dataframe into testing and training data
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train_scaled, y_train, test_size=0.5, random_state=42
)

# Fitting model with 50% of the training data
test_model = model
test_model.fit(X_train_split, y_train_split)

# Predicting accuracy
test_pred = test_model.predict(X_test_split)
accuracy = accuracy_score(y_test_split, test_pred)
print(f"Predicted accuracy: {accuracy}") 



Predicted accuracy: 0.866


### Hyperparameter Tuning

In [385]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

c_range = [0.000085, 0.000086, 0.000087, 0.000088, 0.000089, 0.00009, 0.000091, 0.000092, 0.000093, 0.000094, 0.000095, 0.000096, 0.000097, 0.000098, 0.000099] # Hyperparameter C
solvers = ['liblinear', 'sag', 'saga'] # Model solvers
penalties = ['l1', 'l2', 'elasticnet', 'none']

# Define parameter grid
param_grid = {
    'C': c_range,  # Inverse of regularization strength
    'solver': solvers,
    'penalty': penalties,
    'l1_ratio': [0.1, 0.5, 0.9],
    'warm_start': [True, False]
}

# Search for best parameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

# Print best C
best_C = grid_search.best_params_
print(f"Best parameters: {best_C}")

2700 fits failed out of a total of 5400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\wheff\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\wheff\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\wheff\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solve

Best parameters: {'C': 9.1e-05, 'l1_ratio': 0.5, 'penalty': 'l2', 'solver': 'saga', 'warm_start': True}
