In [1]:
# Please note: you are only allowed to use the following packages:
# pandas, numpy, scikit-learn(sklearn), nltk
# You may not use other external data sources

import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [2]:
train = pd.read_csv("https://raw.githubusercontent.com/aiwei/inst414-21s/main/kaggle/train.csv")

In [3]:
train.head(5)

Unnamed: 0,label,text
0,1,"Henry Thomas showed a restraint, even when the..."
1,1,"This movie starts out brisk, has some slow mom..."
2,1,Castle of Blood is a good example of the quali...
3,1,I viewed the movie together with a homophobic ...
4,1,"The ""Men in White"" movie is definitely one of ..."


In [4]:
test = pd.read_csv("https://raw.githubusercontent.com/aiwei/inst414-21s/main/kaggle/test.csv")

In [5]:
test.head(5)

Unnamed: 0,Id,text
0,0,I cannot believe I actually sat through the wh...
1,1,I saw this one remastered on DVD. It had a big...
2,2,"Irrespective of the accuracy of facts, Bandit ..."
3,3,"Significant Spoilers! This is a sick, disturbi..."
4,4,If there are people that don't like this movie...


There are several ways to start with. First, you should think of a way to transform the text of a review to a feature vector, such that each dimension represents a word and the value represents the weight of that word in the review. You can also try different TF-IDF tricks to adjust the weightings. You may also consider adding bi-gram features as well. the `sklearn` package offers some ways to extract features from text, so let's play with one of them.

### Data Preprocessing

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmer = WordNetLemmatizer()
def preprocessor(review):
    words = nltk.word_tokenize(review)
    # Lemmatization
    lemm = [lemmer.lemmatize(word) for word in words]
    processed_reviews = ' '.join(lemm)
    return processed_reviews

In [7]:
cont_vect = CountVectorizer(lowercase=True, stop_words='english', min_df=10, preprocessor=preprocessor,
                            ngram_range=(1, 2))

In [8]:
X_train = cont_vect.fit_transform(train.text)



In [9]:
X_train

<10000x17769 sparse matrix of type '<class 'numpy.int64'>'
	with 1024171 stored elements in Compressed Sparse Row format>

As the name suggest, we are transforming the reviews in the training set as a 10000 x 51704 matrix. THe number 51704 indicates that there are 51704 unique word in the training reviews. We can also limit the number of features in the matrix by setting the `max_features` when initiating the CountVectorizer.

In [10]:
X_test = cont_vect.transform(test.text)

In [11]:
y_train = train.label

You may notice that we are no longer constructing X_train, X_test, y_train, y_test using train_test_split. Obviously, the train-test split is now provided by a third-party and the y_test is hidden from you.

## Building a Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
model = LogisticRegression()

In [13]:
model.fit(X_train, y_train)

In [14]:
model_prediction = model.predict(X_test)

In [15]:
model_prediction_df = pd.DataFrame({"Id": test.Id, "Category": model_prediction})

In [16]:
model_prediction_df.head(10)

Unnamed: 0,Id,Category
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
5,5,1
6,6,0
7,7,1
8,8,1
9,9,1


In [17]:
# model_prediction_df.to_csv("stem_SVM_model_prediction.csv", index=False)

### Accuracy Testing

In [18]:
# Splitting 'train' dataframe into testing and training data
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)

# Fitting model with 50% of the training data
test_model = model
test_model.fit(X_train_split, y_train_split)

# Predicting accuracy
test_pred = test_model.predict(X_test_split)
accuracy = accuracy_score(y_test_split, test_pred)
print(f"Predicted accuracy: {accuracy}") 

Predicted accuracy: 0.8452


### Hyperparameter Tuning