In [39]:
import json # we need to use the JSON package to load the data, since the data is stored in JSON format
import numpy as np
import matplotlib.pyplot as plt
import timeit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [14]:
# Lowercase the text and split by whitespace
def preprocess_simple(data):
    """Lowercase each data point's text and split it on whitespace.

    Parameters
    ----------
    data : iterable of dict
        Data points; each must hold a string under the "text" key.

    Returns
    -------
    list of dict
        A new list of new dicts, identical to the inputs except that "text"
        is replaced by the list of lowercase whitespace-separated tokens.
        The input list and its dicts are left unmodified.
    """
    # list.copy() is only a shallow copy: the dicts inside are shared, so
    # assigning to data_point["text"] would overwrite the caller's data.
    # Build a fresh dict per data point instead.
    return [
        dict(data_point, text=data_point["text"].lower().split())
        for data_point in data
    ]

In [3]:
# Preprocess with the nltk package: tokenize, lowercase, lemmatize, and remove irrelevant symbols
def preprocess_new_feature(data):
    """Tokenize, lowercase, lemmatize and strip punctuation tokens from each text.

    Each data point's "text" string is tokenized with nltk's ``word_tokenize``,
    every token is lowercased and passed through ``WordNetLemmatizer.lemmatize``,
    and tokens that are pure punctuation/quote symbols are dropped.

    Parameters
    ----------
    data : iterable of dict
        Data points; each must hold a string under the "text" key.

    Returns
    -------
    list of dict
        A new list of new dicts whose "text" is the list of cleaned tokens.
        The input list and its dicts are left unmodified.
    """
    # Tokens that carry no word content and are removed after lemmatization.
    ignored_tokens = {'.', ',', "'", '"', "?", "!", "[", "]", '(', ')', '-', '...',
                      "''", '``', ":", " "}
    wordnet_lemmatizer = WordNetLemmatizer()
    processed = []
    for data_point in data:
        tokens = word_tokenize(data_point["text"])
        tokens = [wordnet_lemmatizer.lemmatize(w.lower()) for w in tokens]
        tokens = [w for w in tokens if w not in ignored_tokens]
        # A shallow list copy would share the dicts with the caller, so the
        # caller's raw text would be overwritten; create a new dict instead.
        processed.append(dict(data_point, text=tokens))
    return processed

In [4]:
# Split data to training, validation and testing sets.
def split_data(data, train_size=10000, val_size=1000):
    """Split a sequence into consecutive training, validation and test parts.

    Parameters
    ----------
    data : sequence
        Anything that supports slicing (list, numpy array, ...).
    train_size : int, optional
        Number of leading items that form the training set (default 10000).
    val_size : int, optional
        Number of items after the training set that form the validation set
        (default 1000).

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data)``; the test set is everything
        after the validation set.
    """
    val_end = train_size + val_size
    return data[:train_size], data[train_size:val_end], data[val_end:]

In [18]:
# Return the most frequent words with their counts, as a list of (word, count) pairs
# num specifies how many of the top words to return
def words_count(data, num=60):
    """Count token occurrences over all data points and return the top ``num``.

    Parameters
    ----------
    data : iterable of dict
        Data points whose "text" entry is an iterable of tokens.
    num : int, optional
        How many of the most frequent tokens to keep (default 60).

    Returns
    -------
    list of tuple
        ``(token, count)`` pairs sorted by descending count; tokens with equal
        counts keep the order in which they were first seen.
    """
    frequency = {}
    for entry in data:
        for token in entry["text"]:
            frequency[token] = frequency.get(token, 0) + 1
    ranked = list(frequency.items())
    # list.sort is stable, so ties stay in first-seen order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:num]

In [138]:
# Build the word-count feature matrix for the given data.
# num specifies the number of data points (rows of the matrix).
def feature_extraction(data, num, words_recurrence):
    """Build a matrix of per-data-point counts for the selected words.

    Parameters
    ----------
    data : iterable of dict
        Data points whose "text" entry is an iterable of tokens.
    num : int
        Number of rows to allocate; row ``k`` is filled from the ``k``-th
        data point.
    words_recurrence : sequence
        Items whose first element is a word; the position of an item in this
        sequence is the column used for that word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num, len(words_recurrence))`` holding how many times
        each selected word occurs in each data point.
    """
    column_of = {entry[0]: col for col, entry in enumerate(words_recurrence)}
    counts = np.zeros((num, len(words_recurrence)))
    for row, sample in enumerate(data):
        for token in sample["text"]:
            col = column_of.get(token)
            if col is not None:
                counts[row, col] += 1
    return counts

In [7]:
# Add bias term to the data matrix
def add_bias(matrix):
    """Append a constant column of ones to a 2-D feature matrix.

    Parameters
    ----------
    matrix : numpy.ndarray
        Array of shape ``(n_rows, n_cols)``.

    Returns
    -------
    numpy.ndarray
        New float array of shape ``(n_rows, n_cols + 1)`` whose last column
        is all ones (the bias term) and whose other columns copy ``matrix``.
    """
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    augmented = np.empty((n_rows, n_cols + 1))
    augmented[:, :n_cols] = matrix
    augmented[:, n_cols] = 1.0
    return augmented

In [107]:
# Calculate the mean square error
def mse(predict, true):
    """Return the mean of the squared element-wise differences.

    Parameters
    ----------
    predict, true : numpy.ndarray
        Predicted and reference values; they must be subtractable
        (same or broadcastable shapes).

    Returns
    -------
    float
        Mean squared error taken over all elements.
    """
    residual = predict - true
    return np.mean(residual * residual)

## Task 1

This cell splits the data into training, validation, and test sets and **prints the top 160 words in the training set.**

In [29]:
# Open the file
# Load the raw data and apply the simple (lowercase + whitespace) preprocessing.
with open("proj1_data.json") as fp:
    data = json.load(fp)
data_processed = preprocess_simple(data)

# Rank words on the training split only, then keep just the words (no counts).
train_data, val_data, test_data = split_data(data_processed)
words_top160 = words_count(train_data, num=160)
words_list = [entry[0] for entry in words_top160]
words_list

['the',
 'i',
 'a',
 'to',
 'and',
 'of',
 'you',
 'that',
 'in',
 'it',
 'was',
 'is',
 'my',
 'for',
 'have',
 'but',
 'this',
 'with',
 'on',
 'not',
 'be',
 'just',
 'or',
 'if',
 'they',
 'your',
 'so',
 'like',
 'are',
 'at',
 'me',
 'as',
 'he',
 "it's",
 'she',
 "i'm",
 'about',
 'when',
 'we',
 'all',
 'because',
 'from',
 'her',
 'out',
 'would',
 'get',
 "don't",
 'had',
 'what',
 'one',
 'up',
 'people',
 'can',
 'an',
 'do',
 '[deleted]',
 'some',
 'how',
 'any',
 'really',
 'his',
 'then',
 'think',
 'no',
 'more',
 'there',
 'their',
 'who',
 'were',
 'will',
 'it.',
 'them',
 'by',
 'has',
 'only',
 'time',
 'go',
 'know',
 'even',
 'got',
 'been',
 'good',
 'new',
 'after',
 "you're",
 'never',
 'please',
 'make',
 'than',
 'am',
 'still',
 "that's",
 'him',
 'did',
 'being',
 'into',
 'much',
 "didn't",
 'other',
 'something',
 'going',
 'could',
 'post',
 'first',
 'way',
 'where',
 'also',
 'want',
 'over',
 '-',
 'most',
 "can't",
 'say',
 'see',
 'pretty',
 'why',

## Task 2

### Closed form solution

In [102]:
# Closed form solution for linear regression
def closed_form(X, y):
    """Solve least-squares linear regression through the normal equations.

    Computes the weights ``w`` satisfying ``(X^T X) w = X^T y``.

    Parameters
    ----------
    X : array_like
        Design matrix of shape ``(n_samples, n_features)``.
    y : array_like
        Targets of shape ``(n_samples,)``.

    Returns
    -------
    numpy.ndarray
        Weight vector of shape ``(n_features,)``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    X_t = np.transpose(X)
    gram = np.dot(X_t, X)
    moment = np.dot(X_t, y)
    # Solving the linear system directly is cheaper and numerically more
    # accurate than forming the explicit inverse and multiplying by it.
    return np.linalg.solve(gram, moment)

### Gradient descent solution

In [114]:
# Gradient descent solution for linear regression
def gradient_descent(X, y, beta, n0, eps, show=False, max_iter=None):
    """Fit linear-regression weights by gradient descent with a decaying step.

    At update number ``i`` (starting at 1) the step size is
    ``alpha = n0 / ((1 + beta * i) * 10000)`` and the weights are moved by
    ``-2 * alpha * (X^T X w - X^T y)``. Iteration stops once the L2 norm of
    the change in the weights is below ``eps``.

    The starting weights are drawn with ``np.random.rand``, so repeated calls
    return different results unless NumPy's global random state is seeded.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix of shape ``(n_samples, n_features)``.
    y : numpy.ndarray
        Targets of shape ``(n_samples,)``.
    beta : float
        Decay rate of the step size.
    n0 : float
        Initial step-size numerator.
    eps : float
        Convergence threshold on the norm of the weight change.
    show : bool, optional
        If True, print the current weight change every 1000 iterations.
    max_iter : int or None, optional
        Maximum number of weight updates. ``None`` (default) means no limit,
        which is the original behavior.

    Returns
    -------
    numpy.ndarray
        Weight vector of shape ``(n_features,)``. If ``max_iter`` is reached
        first, the weights of the last update are returned.

    Raises
    ------
    FloatingPointError
        If the weight change becomes NaN or infinite, i.e. the iteration
        diverged (step size too large for the data).
    """
    # Fixed divisor of the step size, kept from the original implementation;
    # presumably the training-set size -- TODO confirm.
    step_scale = 10000
    i = 1
    weight = np.random.rand(X.shape[1])
    # X^T X and X^T y do not change between iterations; compute them once.
    xtx = np.transpose(X) @ X
    xty = np.transpose(X) @ y
    while True:
        alpha = n0 / ((1 + beta * i) * step_scale)
        weight_new = weight - 2 * alpha * (xtx @ weight - xty)
        difference = np.linalg.norm(weight_new - weight, ord=2)
        if not np.isfinite(difference):
            # A NaN difference never compares below eps, so without this
            # check a diverging run would loop forever.
            raise FloatingPointError(
                "gradient descent diverged at iteration " + str(i)
                + "; try a smaller n0"
            )
        weight = weight_new
        i += 1
        if show and i % 1000 == 0:
            print("Iteration: " + str(i) + ". Difference: " + str(difference))
        if difference < eps:
            break
        # i - 1 updates have been applied at this point.
        if max_iter is not None and i > max_iter:
            break
    return weight

## Task 3

construct matrix for 160 features

In [142]:
# Load and preprocess the raw data.
with open("proj1_data.json") as fp:
    data = json.load(fp)
data_processed = preprocess_simple(data)

# The vocabulary is the 160 most frequent words of the training split only.
train_data, val_data, test_data = split_data(data_processed)
words_top160 = words_count(train_data, num=160)

# Size each matrix from its own split rather than hard-coding 10000/1000/1000,
# so a split of a different length neither overflows the matrix nor leaves
# padded all-zero rows.
train_x_160 = add_bias(feature_extraction(train_data, len(train_data), words_top160))
val_x_160 = add_bias(feature_extraction(val_data, len(val_data), words_top160))
test_x_160 = add_bias(feature_extraction(test_data, len(test_data), words_top160))

construct matrix for 60 features

In [144]:
# Load and preprocess the raw data.
with open("proj1_data.json") as fp:
    data = json.load(fp)
data_processed = preprocess_simple(data)

# The vocabulary is the 60 most frequent words of the training split only.
train_data, val_data, test_data = split_data(data_processed)
words_top60 = words_count(train_data, num=60)

# Size each matrix from its own split rather than hard-coding 10000/1000/1000,
# so a split of a different length neither overflows the matrix nor leaves
# padded all-zero rows.
train_x_60 = add_bias(feature_extraction(train_data, len(train_data), words_top60))
val_x_60 = add_bias(feature_extraction(val_data, len(val_data), words_top60))
test_x_60 = add_bias(feature_extraction(test_data, len(test_data), words_top60))

construct matrix for 3 features

In [86]:
with open("proj1_data.json") as fp:
    data = json.load(fp)

# One row per data point: [is_root as 0/1, controversiality, children].
# The row count follows the data instead of a hard-coded 12000, which would
# raise IndexError on a longer file and leave zero rows on a shorter one.
x_3 = np.zeros((len(data), 3))
for row, data_point in enumerate(data):
    x_3[row, 0] = 1 if data_point["is_root"] else 0
    x_3[row, 1] = data_point["controversiality"]
    x_3[row, 2] = data_point["children"]

# Use the same split boundaries as the text features, then add the bias column.
train_x_3, val_x_3, test_x_3 = split_data(x_3)
train_x_3 = add_bias(train_x_3)
val_x_3 = add_bias(val_x_3)
test_x_3 = add_bias(test_x_3)

Construct y-matrix

In [117]:
with open("proj1_data.json") as fp:
    data = json.load(fp)

# Target vector: one popularity score per data point. Built from the data
# itself instead of a pre-sized 12000-element array, which would raise
# IndexError on a longer file and leave trailing zeros on a shorter one.
y = np.array([data_point["popularity_score"] for data_point in data], dtype=float)

# Same split boundaries as the feature matrices.
y_train, y_val, y_test = split_data(y)

### Running time comparison

In [119]:
# Wall-clock time of the closed-form fit on the 3-feature matrix.
start = timeit.default_timer()
weights_cf = closed_form(train_x_3, y_train)
end = timeit.default_timer()
run_time_cf = end - start

# Wall-clock time of gradient descent on the same data
# (progress is printed every 1000 iterations because show=True).
start = timeit.default_timer()
weights_gd = gradient_descent(
    train_x_3,
    y_train,
    beta=0.01,
    n0=0.01,
    eps=1e-06,
    show=True,
)
end = timeit.default_timer()
run_time_gd = end - start

Iteration: 1000. Difference: 0.00015673044977630802
Iteration: 2000. Difference: 6.472833334567594e-05
Iteration: 3000. Difference: 3.8169431291888766e-05
Iteration: 4000. Difference: 2.619291552084706e-05
Iteration: 5000. Difference: 1.955659561260473e-05
Iteration: 6000. Difference: 1.5408475534209258e-05
Iteration: 7000. Difference: 1.260123544594461e-05
Iteration: 8000. Difference: 1.0591127106886764e-05
Iteration: 9000. Difference: 9.089756532183915e-06
Iteration: 10000. Difference: 7.931015766893505e-06
Iteration: 11000. Difference: 7.012988581357301e-06
Iteration: 12000. Difference: 6.26993372314814e-06
Iteration: 13000. Difference: 5.6576842529304175e-06
Iteration: 14000. Difference: 5.145549775031771e-06
Iteration: 15000. Difference: 4.7115894529005165e-06
Iteration: 16000. Difference: 4.339731842562988e-06
Iteration: 17000. Difference: 4.017954853829322e-06
Iteration: 18000. Difference: 3.737098949840631e-06
Iteration: 19000. Difference: 3.4900718205898256e-06
Iteration: 2000

In [120]:
# Report both timings in seconds.
print("Running time for closed form with 3 features is: ", run_time_cf, "s.", sep="")
print("Running time for gradient descent with 3 features is : ", run_time_gd, "s.", sep="")

Running time for closed form with 3 features is: 0.00058034659605255s.
Running time for gradient descent with 3 features is : 0.4716558564441584s.


### MSE comparison

In [122]:
# Validation-set error of each solver's weights on the 3-feature matrix.
val_pred_cf = val_x_3 @ weights_cf
val_pred_gd = val_x_3 @ weights_gd
mse_cf = mse(val_pred_cf, y_val)
mse_gd = mse(val_pred_gd, y_val)

In [123]:
# Report the validation MSE of both solvers.
print("MSE for closed form with 3 features is: ", mse_cf, sep="")
print("MSE for gradient descent with 3 features is: ", mse_gd, sep="")

MSE for closed form with 3 features is: 1.0203266848431447
MSE for gradient descent with 3 features is: 1.0511459440423714


### Stability

In [124]:
# Closed form is deterministic: every run should print the same weights.
print("Run 5 times for closed form:")
for trial in range(5):
    weights_cf = closed_form(train_x_3, y_train)
    print(weights_cf)

# Gradient descent starts from random weights, so runs may differ.
print("Run 5 times for gradient descent (same hyperparameter):")
for trial in range(5):
    weights_gd = gradient_descent(
        train_x_3,
        y_train,
        beta=0.01,
        n0=0.01,
        eps=1e-06,
    )
    print(weights_gd)

Run 5 times for closed form:
[-0.22627679 -1.08584747  0.37536403  0.82092517]
[-0.22627679 -1.08584747  0.37536403  0.82092517]
[-0.22627679 -1.08584747  0.37536403  0.82092517]
[-0.22627679 -1.08584747  0.37536403  0.82092517]
[-0.22627679 -1.08584747  0.37536403  0.82092517]
Run 5 times for gradient descent (same hyperparameter):
[-0.12245861  0.3295951   0.37644725  0.74988146]
[-0.16823466  0.40122809  0.37391072  0.77429809]
[-0.13625649  0.14536742  0.3762923   0.75928355]
[-0.142403    0.37862931  0.3752892   0.76034302]
[-0.15684602  0.18040528  0.37514236  0.77023957]


### Performance comparison for different features

We use the **closed form** solution in our comparison.

In [145]:
# Fit one closed-form model per feature set.
# NOTE(review): the 60- and 160-word matrices are built from word counts plus
# a bias column only; they do not include the three non-text features.
# Confirm this is the intended comparison.
weights_3 = closed_form(train_x_3, y_train)
weights_60 = closed_form(train_x_60, y_train)
weights_160 = closed_form(train_x_160, y_train)

# Training and validation MSE for each model.
mse_3_train = mse(train_x_3 @ weights_3, y_train)
mse_3_val = mse(val_x_3 @ weights_3, y_val)
mse_60_train = mse(train_x_60 @ weights_60, y_train)
mse_60_val = mse(val_x_60 @ weights_60, y_val)
mse_160_train = mse(train_x_160 @ weights_160, y_train)
mse_160_val = mse(val_x_160 @ weights_160, y_val)

Compare the MSE on the training and validation sets for the different feature sets.

In [146]:
# Report training/validation MSE for each feature set, one line per result.
for label, value in [
    ("MSE for training sets with no text feature:", mse_3_train),
    ("MSE for validation sets with no text feature:", mse_3_val),
    ("MSE for training sets with top 60 words:", mse_60_train),
    ("MSE for validation sets with top 60 words:", mse_60_val),
    ("MSE for training sets with top 160 words:", mse_160_train),
    ("MSE for validation sets with top 160 words:", mse_160_val),
]:
    print(label + str(value))

MSE for training sets with no text feature:1.0846830709157251
MSE for validation sets with no text feature:1.0203266848431447
MSE for training sets with top 60 words:1.3357113742096964
MSE for validation sets with top 60 words:1.2652963638167056
MSE for training sets with top 160 words:1.3179296503478746
MSE for validation sets with top 160 words:1.2917631409955967
