In [1]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, BatchNormalization, Activation, Input, Add, Concatenate
from keras_layer_normalization import LayerNormalization
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Activation, Input, \
    Conv1D, MaxPool1D, Flatten, Concatenate, Add, MaxPooling1D,LSTM
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

Using TensorFlow backend.


In [2]:
def load_data(file_name):
    """
    :param file_name: a file name, type: str
    return a list of ids, a list of reviews, a list of labels
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)

    return df["review_id"], df["text"], df["stars"]

def load_labels(file_name):
    """
    :param file_name: a file name, type: str
    return a list of labels
    """
    return pd.read_csv(file_name)["stars"]

def write_predictions(file_name, pred):
    df = pd.DataFrame(zip(range(len(pred)), pred))
    df.columns = ["review_id", "stars"]
    df.to_csv(file_name, index=False)

In [3]:
def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type|: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)

def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results
    
def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

In [4]:
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    :param data: a list of features, type: list(list)
    :param min_freq: the lowest fequency that the fequency of a feature smaller than it will be filtered out, type: int
    :param max_freq: the highest fequency that the fequency of a feature larger than it will be filtered out, type: int
    :param max_size: the max size of feature dict, type: int
    return a feature dict that maps features to indices, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    if max_size > 0 and min_freq == -1 and max_freq == -1:
        valid_feats = [f for f, cnt in feat_cnt.most_common(max_size)]
    else:
        valid_feats = list()
        for f, cnt in feat_cnt.most_common():
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        valid_feats = valid_feats[:max_size]        
    print("Size of features:", len(valid_feats))
    
    # build a mapping from features to indices
    feats_dict = dict(zip(valid_feats, range(len(valid_feats))))
    return feats_dict

def get_onehot_vector(feats, feats_dict):
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float32)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

def get_count_vector(feats, feats_dict): 
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float32)
    for f in feats:
            # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1) 
        if f_idx != -1:
                # add the corresponding element by 1
                vector[f_idx] += 1 
    return vector

def get_idf_dict(feats_list): 
    """
    :param feats_list: a list of lists of features, type: list(list) return an idf vector,
    """
    N = len(feats_list)
    df_dict = Counter()
    for feats in feats_list:
        df_dict.update(set(feats))
        # IDF: log(1 + N/n)
        idf_dict = {f: math.log2(1+N/cnt) for f, cnt in df_dict.items()}
    return idf_dict

def get_tfidf_vector(feats, feats_dict, idf_dict): 
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict :param idf_dict: a dict from features to idf, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float32)
    # TF: 1 + log(f)
    tf_dict = {f: 1+math.log2(cnt) for f, cnt in Counter(feats).items()} 
    for f in tf_dict:
            # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1) 
        if f_idx != -1:
            tf = tf_dict[f]
            idf = idf_dict[f]
            # set the corresponding element as tf*idf 
            vector[f_idx] = tf*idf
    return vector

In [5]:
train_file = "data/train.csv"
test_file = "data/valid.csv"
#ans_file = "data/ans.csv"
pred_file = "data/pred.csv"
min_freq = 3

# load data
train_ids, train_texts, train_labels = load_data(train_file)
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(test_file)

# extract features
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

train_stemmed = [stem(tokens) for tokens in train_tokens]
test_stemmed = [stem(tokens) for tokens in test_tokens]

train_stemmed = [filter_stopwords(tokens) for tokens in train_stemmed]
test_stemmed = [filter_stopwords(tokens) for tokens in test_stemmed]

train_2_gram = [n_gram(tokens, 2) for tokens in train_stemmed]
train_3_gram = [n_gram(tokens, 3) for tokens in train_stemmed]
test_2_gram = [n_gram(tokens, 2) for tokens in test_stemmed]
test_3_gram = [n_gram(tokens, 3) for tokens in test_stemmed]

# build the feature list
train_feats = list()
for i in range(len(train_ids)):
    train_feats.append(
        train_stemmed[i] + train_2_gram[i])
test_feats = list()
for i in range(len(test_ids)):
    test_feats.append(
        test_stemmed[i] + test_2_gram[i])

# build a mapping from features to indices
feats_dict = get_feats_dict(
    chain.from_iterable(train_feats),
    min_freq=5)

train_feats_matrix = np.vstack(
    [get_onehot_vector(f, feats_dict) for f in train_feats])
test_feats_matrix = np.vstack(
    [get_onehot_vector(f, feats_dict) for f in test_feats])

# convert labels to label_matrix
num_classes = max(train_labels)
# convert each label to a ont-hot vector, and then stack vectors as a matrix
train_label_matrix = keras.utils.to_categorical(train_labels-1, num_classes=num_classes)
test_label_matrix = keras.utils.to_categorical(test_labels-1, num_classes=num_classes)

Size of features: 50501


In [6]:
embedding_matrixx = np.zeros((len(feats_dict), 100), dtype=np.float32)
with open('models/glove.6B.100d.txt',  encoding='utf8') as f:

    while True:
        line = f.readline()
        if not line:
            break
        word, vec = line.split(" ", 1)
        word_idx = feats_dict.get(word, -1)
        if word_idx != -1:
            embedding_matrixx[word_idx] = np.array(vec.split(), dtype=np.float32)

In [7]:
embedding_matrix = np.zeros((len(feats_dict), 100), dtype=np.float32)
with open("models/word2vec.vec", "r") as f:
    n_words, n_dim = f.readline().split()
    n_words, n_dim = int(n_words), int(n_dim) 
    print("number of words:", n_words, "word dimension:", n_dim)
    while True:
        line = f.readline()
        if not line:
            break
        word, vec = line.split(" ", 1)
        word_idx = feats_dict.get(word, -1)
        if word_idx != -1:
            embedding_matrix[word_idx] = np.array(vec.split(), dtype=np.float32)
max_len = 100

number of words: 3970 word dimension: 100


In [8]:
def build_Res_Net(input_size, vocab_size, embedding_size, 
                 output_size, num_layers, hidden_size,
              activation="relu",
              dropout_rate=0.0,
              batch_norm=False,
              layer_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.1,
              metric="accuracy",embedding_matrix = embedding_matrixx):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_layers: the number of layers, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param layer_norm: whether to enable layer normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a multi-layer network with residual connections,
    # activation
    # dropout document: https://keras.io/layers/core/#dropout
    # batch normalization document: https://keras.io/layers/normalization/
    # layer normalization: https://github.com/CyberZHG/keras-layer-normalization
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_size,))

    y = Embedding(input_dim=vocab_size,
                        output_dim=embedding_size,
                        input_length=input_size,
                        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                        trainable=True)(x)
    if num_layers == 1:
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=input_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros",
                  kernel_regularizer=keras.regularizers.l2(l2_reg))(y)
    else:
        h = x
        for i in range(num_layers-1):
            if i == 0:
                # fitst layer: input -> hidden
                new_h = Dense(hidden_size,
                          input_dim=input_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            else:
                new_h = Dense(hidden_size,
                          input_dim=hidden_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            # add layer_norm
            if layer_norm:
                new_h = LayerNormalization()(new_h)
            # add batch_norm
            if batch_norm:
                new_h = BatchNormalization()(new_h)
            # residual connection
            if i == 0:
                h = new_h
            else:
                h = Add()([h, new_h])
            # add activation
            h = Activation(activation)(h)
            # add dropout here (set seed as 0 in order to reproduce)
            if dropout_rate > 0.0:
                h = Dropout(dropout_rate, seed=0)(h)
        # last layer: hidden -> class
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=hidden_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros")(h)
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

In [9]:
model = build_Res_Net(input_size=len(feats_dict), vocab_size = len(feats_dict),embedding_size=100,output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.006, dropout_rate=0.3)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_seed(0)
res_history = model.fit(train_feats_matrix, train_label_matrix,
                        validation_split=0.1,
                        epochs=20, batch_size=100, verbose=0,
                        callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 1.2378405725955963 training accuracy 0.7918000221252441
test loss: 1.4425039887428284 test accuracy 0.6629999876022339


In [10]:
# save predictions
pred_file = "data/pred.csv"
#test_pred = model.predict_classes(test_feats_matrix)
test_pred = model.predict(test_feats_matrix)
test_pred = np.argmax(test_pred,axis=1)
write_predictions(pred_file, test_pred)
ans = pd.read_csv("data/valid.csv", usecols=["review_id", "stars"])
pred = pd.read_csv("data/pred.csv", usecols=["review_id", "stars"])
df = pd.merge(ans, pred, how="left", on=["review_id"])
df.fillna(0, inplace=True)
acc = accuracy_score(df["stars_x"], df["stars_y"])
p, r, f1, _ = precision_recall_fscore_support(df["stars_x"], df["stars_y"], average="macro")
print("accuracy:", acc, "\tprecision:", p, "\trecall:", r, "\tf1:", f1)


ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat