In [1]:
import json
import nltk
import re
import itertools
from time import sleep
import sys
import numpy
import scipy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
with open("proj1_data.json") as fp:
    data = json.load(fp)

In [2]:
train = data[:10000]
validation = data[10000:11000]
test = data[11000:12000]

In [3]:
def countAllFrequencyNaive(dataSet): 
    totalString = str()
    for d in dataSet: 
        totalString = totalString + ' ' + d['text']
    countNaive = Counter([s for s in totalString.lower().strip().split()])
    totalCount = []
    return list(map(lambda v: v[0], countNaive.most_common(160)))

In [4]:
def countAllFrequencyStopWord(dataSet): 
    
    totalString = str()
    
    for d in dataSet: 
        totalString = totalString + ' ' + d['text']
        
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(totalString)
    
    stopwordsSet = set(stopwords.words())
    
    countDict = Counter(s.lower() for s in withoutPunc if s.lower() not in stopwordsSet)
    
    return list(map(lambda v: v[0], countDict.most_common(160)))

In [5]:
# word count, no removal of punctuations
# @Param: singleText: text to process, numberOfFeatures: 0, 60, 160?
# @Return: vector of count (x in description)
def wordCountNaive(singleText, numberOfFeatures, totalCount): 
    
    countNaive = Counter([s.lower() for s in singleText.split()])
    returnVector = []
    for word in totalCount[:numberOfFeatures]: 
        returnVector.append(float(countNaive[word]))
    return returnVector


# word count, remove punc and stopwords to imporve model
# @Param: singleText: text to process, numberOfFeatures: 0, 60, 160?
# @Return: vector of count (x in description)
def wordCountWithStopwords(singleText, numberOfFeatures, totalCount): 
    
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    
    stopwordsSet = set(stopwords.words())
    
    countDict = Counter([s.lower() for s in withoutPunc if s.lower() not in stopwordsSet])
    
    returnVector = []
    for word in totalCount[:numberOfFeatures]: 
        returnVector.append(float(countDict[word]))
    
    return returnVector

In [6]:
def pythonListTranspose(xl): 
    return list(map(list, itertools.zip_longest(*xl)))

In [7]:
# Feature Parser
# @Param: dataVector: sliced original dataset, wordCountFunction: <str> -> ndarray<float>, 
# wordPOSFunction: str -> list<float>, numberOfTextFeature: 0 to shut down Text Processing (Text Features)
# featureType: 
# @Return: tuple<ndarray, list>: xEngineered, yExtracted: rows: vector<samplePoint>, samplePoint[0->2]: basic Features, 
# samplePoint[3->162]: text, samplePoint[163->167]: extra
def parseFeatures(dataVector, wordCountFunction, numberOfTextFeatures, featureType): 
    y = []
    childrenFeature = []
    controversialityFeature = []
    isRootFeature = []
    processedTextFeature = []
    verbFeature = []
    nounFeature = []
    adjFeature = []
    urlFeature = []
    identityFeature = []
    c = 0
    lenV = len(dataVector)
    for dataPoint in dataVector: 
        
        # Basic Features
        y.append(float(dataPoint['popularity_score']))
        if 'children' in featureType: 
            childrenFeature.append(float(dataPoint['children']))
        if 'controv' in featureType: 
            controversialityFeature.append(float(dataPoint['controversiality']))
        identityFeature.append(1.0)
        isRootVar = -1.0
        if 'isRoot' in featureType: 
            if dataPoint['is_root'] == True: 
                isRootVar = 1.0
                isRootFeature.append(isRootVar)
            else: 
                isRootVar = 0.0
                isRootFeature.append(isRootVar)
        
        # Text Features: 0 to shut down text feature
        if numberOfTextFeatures > 0 and 'text' in featureType: 
            processedTextFeature.append(wordCountFunction(dataPoint['text'], numberOfTextFeatures))
        
        # Extra Features
        if 'noun' in featureType or 'verb' in featureType or 'adj' in featureType: 
            wordAnalysis = wordPOSCountWithStopwords(dataPoint['text'])
        if 'verb' in featureType: 
            verbFeature.append(wordAnalysis[0])
        if 'noun' in featureType: 
            nounFeature.append(wordAnalysis[1])
        if 'adj' in featureType: 
            adjFeature.append(wordAnalysis[2])
        if 'url' in featureType: 
            urlFeature.append(hasURL(dataPoint['text']))
        
        # Process Bar
        sys.stdout.write('\r')
        count = int((float(c) / float(lenV)) * 100)
        sys.stdout.write("[%-20s] %d%%" % ('='*int(count / 5), count))
        sleep(0.001)
        sys.stdout.flush()
        c = c + 1
    if len(processedTextFeature) > 0: 
        processedTextFeature = pythonListTranspose(processedTextFeature)
    returnTotal = [childrenFeature, controversialityFeature, isRootFeature, verbFeature, nounFeature, adjFeature, urlFeature, identityFeature] + processedTextFeature
    return pythonListTranspose([x for x in returnTotal if len(x) > 0]), pythonListTranspose([y])

In [8]:
def meanSquareError(valX, valY, w): 
    diffenence = numpy.power(numpy.transpose(numpy.subtract(valY, numpy.matmul(valX, w)))[0], 2)
    return numpy.divide(numpy.sum(diffenence), len(valY))

In [9]:
def closedFormLinearRegression(x, y): 
    xT = numpy.transpose(numpy.array(x))
    return numpy.matmul(numpy.matmul(scipy.linalg.inv(numpy.matmul(xT, numpy.array(x))), xT), numpy.array(y))

In [10]:
def gradientDescentLinearRegression(learnRateFunction, x, y, tol): 
    i = 1
    weight = numpy.array([[0.0] for l in range(len(x[0]))])
    weightN = numpy.array([[0.0] for l in range(len(x[0]))])
    xT = numpy.transpose(x)
    xTx = numpy.matmul(xT, x)
    xTy = numpy.matmul(xT, y)
    while True: 
        weight = weightN
        weightN = numpy.subtract(weight, 2 * learnRateFunction(i) * numpy.subtract(numpy.matmul(xTx, weight), xTy))
        i = i + 1
        if numpy.linalg.norm(numpy.subtract(weightN, weight), 2) <= tol: 
            break
    return weightN

In [11]:
tFreq = countAllFrequencyNaive(train)
vFreq = countAllFrequencyNaive(validation)
trainFeatures = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 0, ['children', 'controv', 'isRoot'])
validationFeatures = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, vFreq), 0, ['children', 'controv', 'isRoot'])



In [12]:
resultClosed = closedFormLinearRegression(trainFeatures[0], trainFeatures[1])
errorClosed = meanSquareError(validationFeatures[0], validationFeatures[1], resultClosed)
errorTrained = meanSquareError(trainFeatures[0], trainFeatures[1], resultClosed)
print('result of closed form: \n', resultClosed)
print('error of closed form: \n', errorClosed)
print('error of closed trained: \n', errorTrained)

result of closed form: 
 [[ 0.37536403]
 [-1.08584747]
 [-0.22627679]
 [ 0.82092517]]
error of closed form: 
 1.0203266848431447
error of closed trained: 
 1.0846830709157251


In [13]:
tFreq = countAllFrequencyNaive(train)
trainFeatures60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'text'])
trainFeatures160 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'isRoot', 'text'])



In [14]:
vFreq = countAllFrequencyNaive(validation)
validationFeatures60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, vFreq), 60, ['children', 'controv', 'isRoot', 'text'])
validationFeatures160 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, vFreq), 160, ['children', 'controv', 'isRoot', 'text'])



In [15]:
resultClosed60 = closedFormLinearRegression(trainFeatures60[0], trainFeatures60[1])
errorClosed60 = meanSquareError(validationFeatures60[0], validationFeatures60[1], resultClosed60)
errorTrained60 = meanSquareError(trainFeatures60[0], trainFeatures60[1], resultClosed60)
# print('result of closed form: \n', resultClosed60)
print('error of closed form: \n', errorClosed60)
print('error of closed trained: \n', errorTrained60)

error of closed form: 
 1.0558077809463702
error of closed trained: 
 1.060429141685383


In [16]:
resultClosed160 = closedFormLinearRegression(trainFeatures160[0], trainFeatures160[1])
errorClosed160 = meanSquareError(validationFeatures160[0], validationFeatures160[1], resultClosed160)
errorTrained160 = meanSquareError(trainFeatures160[0], trainFeatures160[1], resultClosed160)
# print('result of closed form: \n', resultClosed160)
print('error of closed form: \n', errorClosed160)
print('error of closed trained: \n', errorTrained160)
len(trainFeatures160[1])

error of closed form: 
 1.0686390774956738
error of closed trained: 
 1.0477763217987115


10000

In [17]:
tFreqS = countAllFrequencyStopWord(train)
trainFeatures60S = parseFeatures(data[:10000], lambda u, v: wordCountWithStopwords(u, v, tFreqS), 60, ['children', 'controv', 'isRoot', 'text'])
trainFeatures160S = parseFeatures(data[:10000], lambda u, v: wordCountWithStopwords(u, v, tFreqS), 160, ['children', 'controv', 'isRoot', 'text'])



In [18]:
vFreqS = countAllFrequencyStopWord(validation)
validationFeatures60S = parseFeatures(validation, lambda u, v: wordCountWithStopwords(u, v, vFreqS), 60, ['children', 'controv', 'isRoot', 'text'])
validationFeatures160S = parseFeatures(validation, lambda u, v: wordCountWithStopwords(u, v, vFreqS), 160, ['children', 'controv', 'isRoot', 'text'])



In [19]:
resultClosed60S = closedFormLinearRegression(trainFeatures60S[0], trainFeatures60S[1])
errorClosed60S = meanSquareError(validationFeatures60S[0], validationFeatures60S[1], resultClosed60S)
errorTrained60S = meanSquareError(trainFeatures60S[0], trainFeatures60S[1], resultClosed60S)
# print('result of closed form: \n', resultClosed60S)
print('error of closed form: \n', errorClosed60S)
print('error of closed trained: \n', errorTrained60S)

error of closed form: 
 1.07058053600855
error of closed trained: 
 1.0690435344142226


In [20]:
resultClosed160S = closedFormLinearRegression(trainFeatures160S[0], trainFeatures160S[1])
errorClosed160S = meanSquareError(validationFeatures160S[0], validationFeatures160S[1], resultClosed160S)
errorTrained160S = meanSquareError(trainFeatures160S[0], trainFeatures160S[1], resultClosed160S)
# print('result of closed form: \n', resultClosed160S)
print('error of closed form: \n', errorClosed160S)
print('error of closed trained: \n', errorTrained160S)
len(trainFeatures160[1])

error of closed form: 
 1.1817415713223627
error of closed trained: 
 1.0504238189332942


10000

In [21]:
# word pos count, remove punc and stopwords to imporve model
# @Param: singleText: text to process
# @Return: vector of [verbcount, nouncount, adjcount]
def wordPOSCountWithStopwords(singleText): 
    
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    setOfStop = set(stopwords.words())
    tagged = nltk.pos_tag([s.lower() for s in withoutPunc if s.lower() not in setOfStop])
    
    verbTotal, nounTotal, adjTotal = 0, 0, 0
    counts = Counter(tag for wordType, tag in tagged)
    totalCount = len(singleText.split())
    
    for key, value in counts.items(): 
        if 'NN' in key: 
            nounTotal = nounTotal + 1
        elif 'VB' in key: 
            verbTotal = verbTotal + 1
        elif 'JJ' in key: 
            adjTotal = adjTotal + 1
            
    if totalCount > 0: 
        return [float(verbTotal) / totalCount, float(nounTotal) / totalCount, float(adjTotal) / totalCount]
    else: 
        return [0.0, 0.0, 0.0]
# Test: 
# print(wordPOSCountWithStopwords(data[3]['text']))
# print(data[3]['text'])


def hasURL(text): 
    if re.match(r"(http://[^ ]+)", text) != None: 
        return 1.0
    else: 
        return 0.0

In [22]:
trainMyFeatures60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'noun', 'url'])



In [23]:
# vFreq = countAllFrequencyNaive(validation)
validationMyFeatures60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, vFreq), 60, ['children', 'controv', 'isRoot', 'noun', 'url'])



In [24]:
resultMyClosed = closedFormLinearRegression(trainMyFeatures60[0], trainMyFeatures60[1])
errorMyClosed = meanSquareError(validationMyFeatures60[0], validationMyFeatures60[1], resultMyClosed)
errorMyTrained = meanSquareError(trainMyFeatures60[0], trainMyFeatures60[1], resultMyClosed)
# print('result of closed form: \n', resultMyClosed)
print('error of closed form: \n', errorMyClosed)
print('error of closed trained: \n', errorMyTrained)

error of closed form: 
 1.0208223075830674
error of closed trained: 
 1.084599876507505


In [25]:
import pandas
trainMyFeatures60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'text', 'noun', 'url'])
validationMyFeatures60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, vFreq), 60, ['children', 'controv', 'isRoot', 'text', 'noun', 'url'])



In [26]:
traindf = pandas.DataFrame(trainMyFeatures60[0])
validationdf = pandas.DataFrame(validationMyFeatures60[0])
traindf.shape, validationdf.shape

((10000, 66), (1000, 66))

In [27]:
fullPredictors = set([i for i in range(0, 66)])
print(fullPredictors)
levelBest = [[sys.float_info.max, set()]]
c = 0
for k in range(1, 66): 
    
    prevBest = levelBest[-1][1]
    currentPredictors = fullPredictors - prevBest
    mseVal = []
    
    for predictor in currentPredictors: 
        selected = prevBest | set([predictor])
        
        trainX = traindf[list(selected)].values
        validX = validationdf[list(selected)].values
        trainY = trainMyFeatures60[1]
        validY = validationMyFeatures60[1]
        
        try: 
            resultW = closedFormLinearRegression(trainX, trainY)
        except: 
            continue
        mseVal.append([meanSquareError(validX, validY, resultW), selected])
    
    if len(mseVal) > 0: 
        levelBest.append(min(mseVal, key = lambda x: x[1]))
        print(levelBest)
    
    # Process Bar
    sys.stdout.write('\r')
    count = int((float(c) / float(66)) * 100)
    sys.stdout.write("[%-20s] %d%%" % ('='*int(count / 5), count))
    sleep(0.001)
    sys.stdout.flush()
    c = c + 1

trainX = traindf[list(fullPredictors)].values
validX = validationdf[list(fullPredictors)].values
trainY = trainMyFeatures60[1]
validY = validationMyFeatures60[1]
resultW = closedFormLinearRegression(trainX, trainY)
mseVal.append([selected, meanSquareError(validX, validY, resultW)])

bestOfAll = sorted(levelBest, key = lambda v: v[0])[0]

print(bestOfAll)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65}
[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}]]
[                    ] 0%[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}], [1.5246871624396836, {0, 1}]]
[                    ] 1%[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}], [1.5246871624396836, {0, 1}], [1.4162954264661662, {0, 1, 2}]]
[                    ] 3%[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}], [1.5246871624396836, {0, 1}], [1.4162954264661662, {0, 1, 2}], [1.3500710225535295, {0, 1, 2, 3}]]
[                    ] 4%[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}], [1.5246871624396836, {0, 1}], [1.4162954264661662, {0, 1, 2}], [1.3500710225535295, {0, 1, 2, 3}], [1.3480176875589704, {0, 1, 2, 3, 4}]]
[=   

[=====               ] 25%[[1.7976931348623157e+308, set()], [1.5280029415908647, {0}], [1.5246871624396836, {0, 1}], [1.4162954264661662, {0, 1, 2}], [1.3500710225535295, {0, 1, 2, 3}], [1.3480176875589704, {0, 1, 2, 3, 4}], [1.0208223075830674, {0, 1, 2, 3, 4, 5}], [1.0214159575260346, {0, 1, 2, 3, 4, 5, 6}], [1.0190252374343343, {0, 1, 2, 3, 4, 5, 6, 7}], [1.0188609163971525, {0, 1, 2, 3, 4, 5, 6, 7, 8}], [1.0194067967547549, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}], [1.014384858082088, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}], [1.014296222944646, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}], [1.010010670232153, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}], [1.012758978968836, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}], [1.01573025645501, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}], [1.0154747407120581, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}], [1.0191091130034573, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}], [1.022127120817238, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,







































