This is a lite version of our logistic regression model. You can get 0.7470094610831955 cross validation mean test RMSE using the code below. The prediction file gives 0.73028 score on Kaggle.

If you want more detalied, and powerful model, check out [here](https://github.com/xiaohk/stat333_project_2).

# Functions

In [17]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Ridge
from pickle import load, dump
from os.path import exists
import time

# Change THREAD to change model and prediction name
THREAD = '_1~10'

# If you download the full data set, you can use our pre-stored
# data files

"""
TRAIN_TEXT = '../../data/text_train.csv'
TEST_TEXT = '../../data/text_test.csv'
VALI_TEXT = '../../data/text_validate.csv'
MINI_TEXT = '../../data/text_mini.csv'
PREDICTION_CSV = '../../static/prediction.csv'
VECTOR = '../../static/tf_vector.pickle'
MATRIX = '../../static/tf_matrix.pickle'
PREDICTION = './config/prediction{}.csv'.format(THREAD)
MODEL = './config/model{}.pickle'.format(THREAD)
"""

STOP = set(stopwords.words("english"))
STEMMER = SnowballStemmer("english")
LOAD_NEW = False

# Tunable parameters
MAX_DF = 1.0  # We already have stop words, probably don't need this
MIN_DF = 2  # Discard words which not show up twice in all document
MAX_FEATURE = None  # IF no memory, tune this down
MIN_N = 1
MAX_N = 1  # Uni-gram

DUAL = False  # Feature > sample then true

In [2]:
def tokenize(text):
    """ Helper function for TFIDF vector from sklearn. We use nltk's tokenize
        function here. Also use snowball as stemmer (the middle agressive
        one).
    """
    # Filter out stop words, tokenize the text
    useful_token = [w.lower() for w in word_tokenize(text) if w not in STOP]

    # Stemming the tokens
    stemmed_token = [STEMMER.stem(t) for t in useful_token]

    return stemmed_token

In [3]:
def vectorize_text(train_txt, vali_txt, test_txt, vector, matrix, re_load,
                   min_df=1, max_df=1.0, max_feature=None, min_n=1, max_n=1):
    """ Feature engineering from the raw text input. """
    # If there is saved model, then just use it
    if exists(vector) and exists(matrix) and not re_load:
        # Get train length
        table_train = pd.read_csv(train_txt)

        # Load stored data
        all_mat = load(open(matrix, 'rb'))
        x_train = all_mat[:table_train.shape[0]]
        tf = load(open(vector, 'rb'))

    else:
        # Read all files
        table_train = pd.read_csv(train_txt)
        table_test = pd.read_csv(vali_txt)
        table_vali = pd.read_csv(test_txt)

        text_train = table_train['clean'].tolist()
        text_test = table_test['clean'].tolist()
        text_vali = table_vali['clean'].tolist()

        # We want to have a overall vocabulary bank as `np.py`, so we combine
        # all the text first
        all_text = text_train + text_test + text_vali

        # Record the length so we can recover the training set
        train_length = len(text_train)

        # Initialize TFID arguments
        # Only work for English, discard all Chinese
        tf = TfidfVectorizer(min_df=min_df, max_features=max_feature,
                             strip_accents='ascii', analyzer='word',
                             tokenizer=tokenize, ngram_range=(min_n, max_n))

        # Vectorize all, and transform (more efficient than fit + transform)
        all_mat = tf.fit_transform(all_text)

        # Recover the training data
        x_train = all_mat[:train_length]

        # Store the fitted matrix and tf_vectorizor
        dump(all_mat, open(matrix, 'wb'))
        dump(tf, open(vector, 'wb'))

    print("Successfully load TF-IDF matrix, with shape {}.".format(
        x_train.shape))

    return tf, all_mat, x_train

In [4]:
def score(estimator, x_test, y_test):
    """ Use mean squared error as score for cv."""
    probs = estimator.predict_proba(x_test)
    result = np.zeros(x_test.shape[0])
    for i in range(probs.shape[0]):
        result[i] = dis_to_conti(probs[i])
    y_int = np.array(list(map(int, y_test)))
    # We want to minimize the error
    score = (-1) * np.sqrt(np.mean(np.square(result - y_int)))
    return score


def dis_to_conti(probability):
    """ The kaggle grading is unfair, so I want to force bayesian classifier
        gives a continuous result.
    """
    return sum(probability * np.arange(1, 6))

def score_mlr(estimator, x_test, y_test):
    """ Use mean squared error as score for cv."""
    result = estimator.predict(x_test)
    y_int = np.array(list(map(int, y_test)))

    # We want to minimize the error
    score = (-1) * np.sqrt(np.mean(np.square(result - y_int)))
    return score

In [5]:
def train_lr(x_train, y_train, model_name):
    """ Train a logistic regression model."""
    # Use cross validation to search features
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    # param_grid = {'C': [1, 5]}
    best_model = GridSearchCV(LogisticRegression(penalty=PENALTY, dual=DUAL),
                              param_grid, scoring=score, cv=10)

    best_model.fit(x_train, y_train)
    print(best_model.best_estimator_)

    # Save the model
    dump(best_model, open(model_name, 'wb'))

    return best_model

In [6]:
def predict(estimator, all_matrix, train_length, output_csv):
    """ Predict the test and validation text, and write to csv."""
    # Read the text
    # `all_matrix` has already contained all the test and vali text
    x_predict = all_matrix[train_length:]
    print("Successfully load predicting text, with shape {}.".format(
        x_predict.shape))

    prediction = estimator.predict_proba(x_predict)

    # Convert probability to continuous scores
    result = np.zeros(x_predict.shape[0])
    for i in range(prediction.shape[0]):
        result[i] = dis_to_conti(prediction[i])

    # Combine ID and write to a file
    with open(output_csv, 'w') as output:
        output.write('"Id","Prediction"\n')
        for i in range(len(result)):
            # The id number is 1-indexed
            output.write("{},{}\n".format(i + 1, result[i]))

def predict_mlr(estimator, all_matrix, train_length, output_csv):
    """ Predict the test and validation text, and write to csv. The estimator
        should be a prediction estimator, instead of a classifier.
    """
    # Read the text
    # `all_matrix` has already contained all the test and vali text
    x_predict = all_matrix[train_length:]
    print("Successfully load predicting text, with shape {}.".format(
        x_predict.shape))

    prediction = estimator.predict(x_predict)

    # Combine ID and write to a file
    with open(output_csv, 'w') as output:
        output.write('"Id","Prediction"\n')
        for i in range(len(prediction)):
            output.write("{},{}\n".format(i, prediction[i]))

# Make TF-IDF matrix
Need train: text + label + clean, test: text + clean, validation: text + clean.

To generate those files from given yelp data, please use `project/data/parser.py`

In [7]:
start_time = time.time()

print("Start vectorizing...")

# We disabled the load new feature here, so we make new matrix everytime
tf, all_mat, x_train = vectorize_text('./lite/text_train.csv', 
                                      './lite/text_test.csv', 
                                      './lite/text_validate.csv',
                                      './lite/tf.pickle', 
                                      './lite/matrix.pickle', 
                                      True,
                                      min_df=MIN_DF, max_df=MAX_DF,
                                      max_feature=MAX_FEATURE,
                                      min_n=MIN_N, max_n=MAX_N)

# Make label for train_v
table_train = pd.read_csv(TRAIN_TEXT)

# Use string to represent the categories
y_train = list(map(str, table_train['stars']))

print("--- Used %s seconds ---" % (time.time() - start_time))

Start vectorizing...
Successfully load TF-IDF matrix, with shape (36692, 21888).
--- Used 163.23979306221008 seconds ---


In [8]:
# It contains all vectorized train, test, vali data in order
all_mat.shape

(61153, 21888)

# Train Our Model

In [12]:
start_time = time.time()

model = LogisticRegression(C=2.387755102040816, penalty='l1', n_jobs=4)
model.fit(x_train, y_train)

print("--- Used %s seconds ---" % (time.time() - start_time))

--- Used 8.039577007293701 seconds ---


# Make Prediction

In [14]:
start_time = time.time()

predict(model, all_mat, x_train.shape[0], './lite/prediction.csv')

print("--- Used %s seconds ---" % (time.time() - start_time))

Successfully load predicting text, with shape (24461, 21888).
--- Used 0.2222599983215332 seconds ---


# Evaluate Model

In [16]:
start_time = time.time()

print("5-CV average accuracy: {}".format(
        np.mean(cross_val_score(model, x_train, y_train, cv=5,
                                scoring=score, n_jobs=4, verbose=2))))

print("--- Used %s seconds ---" % (time.time() - start_time))

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=  13.9s
[CV]  ................................................................
[CV] ................................................. , total=  14.0s
[CV] ................................................. , total=  14.1s
[CV] ................................................. , total=  14.2s
[CV] ................................................. , total=   6.1s
5-CV average accuracy: -0.7470094610831955
--- Used 20.465030908584595 seconds ---


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   20.3s finished


Then you get **0.7470094610831955** cross validation mean test RMSE! 

# If you want more

Please check 'project/model/lr/tuning.ipynb', the best model should be unibram + one time ridge + C=6 + less aggressive stop word.