In [1]:
import json
import nltk
import re
import itertools
from time import sleep
import sys
import numpy
import scipy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
# Load the project data set (parsed JSON) from the working directory into `data`.
# Later cells slice `data` positionally and read dict keys from each entry.
with open("proj1_data.json") as fp:
    data = json.load(fp)

In [2]:
# Fixed positional split of `data`: the first 10000 entries are used for training,
# the next 1000 for validation and the following 1000 for testing.
train = data[:10000]
validation = data[10000:11000]
test = data[11000:12000]

## Part 1: feature extraction

In [3]:
def countAllFrequencyNaive(dataSet, numberOfWords=160):
    """Return the most frequent whitespace-separated tokens of a data set.

    Tokens are lower-cased but punctuation is kept, so 'cat' and 'cat.' are
    different tokens.

    @Param dataSet: iterable of dicts, each with a 'text' string.
    @Param numberOfWords: how many of the most frequent tokens to return
        (defaults to 160, the value that was previously hard-coded).
    @Return: list of tokens ordered from most to least frequent; ties keep
        first-seen order. Shorter than numberOfWords if the data set has fewer
        distinct tokens.
    """
    countNaive = Counter()
    # Count document by document instead of concatenating every text into one
    # giant string first (the repeated `+` was quadratic in the corpus size).
    for d in dataSet:
        countNaive.update(d['text'].lower().split())
    return [word for word, _ in countNaive.most_common(numberOfWords)]

In [4]:
def countAllFrequencyStopWord(dataSet):
    """Return the 160 most frequent non-stopword tokens of a data set.

    Every 'text' field is tokenised with the regular expression \\w+ (which
    drops punctuation), tokens are lower-cased, and tokens found in
    stopwords.words() are discarded before counting.

    @Param dataSet: iterable of dicts, each with a 'text' string.
    @Return: list of tokens ordered from most to least frequent.
    """
    joinedText = ''.join(' ' + entry['text'] for entry in dataSet)
    wordTokens = RegexpTokenizer(r'\w+').tokenize(joinedText)

    # stopwords.words() is called without a language argument here.
    ignored = set(stopwords.words())

    frequencies = Counter()
    for token in wordTokens:
        lowered = token.lower()
        if lowered not in ignored:
            frequencies[lowered] += 1

    return [entry[0] for entry in frequencies.most_common(160)]

In [5]:
def wordCountNaive(singleText, numberOfFeatures, totalCount):
    """Count vocabulary words in one text, without removing punctuation.

    @Param singleText: text to process; split on whitespace and lower-cased.
    @Param numberOfFeatures: how many leading entries of totalCount to use.
    @Param totalCount: vocabulary list (e.g. from countAllFrequencyNaive).
    @Return: list of floats; entry i is the number of times totalCount[i]
        occurs in singleText.
    """
    occurrences = Counter(token.lower() for token in singleText.split())
    return [float(occurrences[word]) for word in totalCount[:numberOfFeatures]]


def wordCountWithStopwords(singleText, numberOfFeatures, totalCount):
    """Count vocabulary words in one text after dropping punctuation and stopwords.

    @Param singleText: text to process; tokenised with the regular expression
        \\w+ and lower-cased.
    @Param numberOfFeatures: how many leading entries of totalCount to use.
    @Param totalCount: vocabulary list (e.g. from countAllFrequencyStopWord).
    @Return: list of floats; entry i is the number of times totalCount[i]
        occurs among the kept tokens.
    """
    # NOTE(review): the stopword set is rebuilt on every call.
    ignored = set(stopwords.words())

    occurrences = Counter()
    for token in RegexpTokenizer(r'\w+').tokenize(singleText):
        lowered = token.lower()
        if lowered not in ignored:
            occurrences[lowered] += 1

    return [float(occurrences[word]) for word in totalCount[:numberOfFeatures]]

In [6]:
def pythonListTranspose(xl):
    """Transpose a list of lists.

    Rows of unequal length are padded with None (itertools.zip_longest), so the
    result always has as many rows as the longest input row has entries.
    """
    return [list(column) for column in itertools.zip_longest(*xl)]

In [7]:
def charWordRatio(comment):
    """Ratio between the number of distinct ASCII letters and the number of words.

    Letters are case-sensitive ('a' and 'A' count separately); words are the
    whitespace-separated tokens of the comment.

    @Param comment: text to analyse.
    @Return: float ratio, or 0.0 when the comment contains no words (the
        previous implementation raised ZeroDivisionError in that case).
    """
    wordCount = len(comment.split())
    if wordCount == 0:
        return 0.0
    distinctLetters = {c for c in comment if 'a' <= c <= 'z' or 'A' <= c <= 'Z'}
    return len(distinctLetters) / wordCount

In [8]:
def uniqueChar(comment):
    """Return the number of distinct ASCII letters in the comment.

    Upper- and lower-case forms of a letter are counted separately; digits,
    punctuation, whitespace and non-ASCII characters are ignored.
    """
    return len({c for c in comment if 'a' <= c <= 'z' or 'A' <= c <= 'Z'})

In [9]:
def parseFeatures(dataVector, wordCountFunction, numberOfTextFeatures, featureType):
    """Build the design matrix and the target column for a slice of the data set.

    @Param dataVector: sized sequence of dicts. 'popularity_score' is always
        read; 'children', 'controversiality', 'is_root' and 'text' are read
        only when a requested feature needs them.
    @Param wordCountFunction: callable (text, numberOfTextFeatures) -> list of
        floats. Only called when numberOfTextFeatures > 0 and 'text' is requested.
    @Param numberOfTextFeatures: number of word-count columns; 0 switches the
        text features off.
    @Param featureType: collection of feature names to include. Recognised
        names: 'children', 'controv', 'isRoot', 'text', 'verb', 'noun', 'adj',
        'url', 'child and controv', 'child and isRoot', 'all interacted',
        'controv and isRoot'.
    @Return: tuple (x, y). x is a list of rows; the columns are the requested
        features in the fixed order children, controv, isRoot, verb, noun, adj,
        url, child*controv, child*isRoot, child*isRoot*controv, controv*isRoot,
        followed by a constant 1.0 column and then the word-count columns.
        y is a list of one-element rows holding the popularity score.
    """
    # Membership tests do not change during the loop, so evaluate them once.
    wantChildren = 'children' in featureType
    wantControv = 'controv' in featureType
    wantIsRoot = 'isRoot' in featureType
    wantText = numberOfTextFeatures > 0 and 'text' in featureType
    wantVerb = 'verb' in featureType
    wantNoun = 'noun' in featureType
    wantAdj = 'adj' in featureType
    wantURL = 'url' in featureType
    wantChildControv = 'child and controv' in featureType
    wantChildIsRoot = 'child and isRoot' in featureType
    wantAllInteracted = 'all interacted' in featureType
    wantControvIsRoot = 'controv and isRoot' in featureType
    # The interaction terms need the real is_root value even when the plain
    # 'isRoot' column is not requested; previously they silently multiplied by
    # a -1.0 placeholder in that case.
    needIsRoot = wantIsRoot or wantChildIsRoot or wantAllInteracted or wantControvIsRoot

    y = []
    childrenFeature = []
    controversialityFeature = []
    isRootFeature = []
    processedTextFeature = []
    verbFeature = []
    nounFeature = []
    adjFeature = []
    urlFeature = []
    childAndControv = []
    childAndisRoot = []
    allInteracted = []
    controvAndisRoot = []
    identityFeature = []

    lenV = len(dataVector)
    lastPercent = -1
    for done, dataPoint in enumerate(dataVector, start=1):

        # Basic Features
        y.append(float(dataPoint['popularity_score']))
        if wantChildren:
            childrenFeature.append(float(dataPoint['children']))
        if wantControv:
            controversialityFeature.append(float(dataPoint['controversiality']))
        identityFeature.append(1.0)  # constant column (intercept)
        isRootVar = 0.0
        if needIsRoot:
            isRootVar = 1.0 if dataPoint['is_root'] == True else 0.0
        if wantIsRoot:
            isRootFeature.append(isRootVar)

        # Text Features: 0 to shut down text feature
        if wantText:
            processedTextFeature.append(wordCountFunction(dataPoint['text'], numberOfTextFeatures))

        # Extra Features: tag the text once and reuse the result for all three.
        if wantNoun or wantVerb or wantAdj:
            wordAnalysis = wordPOSCountWithStopwords(dataPoint['text'])
        if wantVerb:
            verbFeature.append(wordAnalysis[0])
        if wantNoun:
            nounFeature.append(wordAnalysis[1])
        if wantAdj:
            adjFeature.append(wordAnalysis[2])
        if wantURL:
            urlFeature.append(hasURL(dataPoint['text']))

        # Interaction Effect
        if wantChildControv:
            childAndControv.append(float(dataPoint['children']) * float(dataPoint['controversiality']))
        if wantChildIsRoot:
            childAndisRoot.append(float(dataPoint['children']) * isRootVar)
        if wantAllInteracted:
            allInteracted.append(float(dataPoint['children']) * isRootVar * float(dataPoint['controversiality']))
        if wantControvIsRoot:
            controvAndisRoot.append(isRootVar * float(dataPoint['controversiality']))

        # Progress bar: redraw only when the percentage changes, count completed
        # rows so the bar ends at 100%, and do not sleep between rows.
        percent = done * 100 // lenV
        if percent != lastPercent:
            sys.stdout.write("\r[%-20s] %d%%" % ('=' * (percent // 5), percent))
            sys.stdout.flush()
            lastPercent = percent
    if lenV > 0:
        sys.stdout.write('\n')

    # Text features are collected row-wise; turn them into columns like the rest.
    if len(processedTextFeature) > 0:
        processedTextFeature = pythonListTranspose(processedTextFeature)
    returnTotal = [childrenFeature, controversialityFeature, isRootFeature, verbFeature, nounFeature, adjFeature, urlFeature, childAndControv, childAndisRoot, allInteracted, controvAndisRoot, identityFeature] + processedTextFeature
    # Drop the columns that were not requested, then switch to row-major layout.
    return pythonListTranspose([x for x in returnTotal if len(x) > 0]), pythonListTranspose([y])

In [10]:
def parseTransformedFeatures(dataVector, wordCountFunction, numberOfTextFeatures, featureType):
    """Build a design matrix like parseFeatures, with non-linear transforms applied.

    Differences from the untransformed features, as implemented below:
      * the 'children' column holds 1 - exp(-0.04 * children);
      * every word count a is replaced by exp(-0.04 * a) * cos(3 * a);
      * the interaction terms are built from the raw (untransformed) children
        count.

    @Param dataVector: sized sequence of dicts. 'popularity_score' is always
        read; 'children', 'controversiality', 'is_root' and 'text' are read
        only when a requested feature needs them.
    @Param wordCountFunction: callable (text, numberOfTextFeatures) -> list of
        floats. Only called when numberOfTextFeatures > 0 and 'text' is requested.
    @Param numberOfTextFeatures: number of word-count columns; 0 switches the
        text features off.
    @Param featureType: collection of feature names to include. Recognised
        names: 'children', 'controv', 'isRoot', 'text', 'verb', 'noun', 'adj',
        'url', 'child and controv', 'child and isRoot', 'all interacted',
        'controv and isRoot'.
    @Return: tuple (x, y). x is a list of rows; the columns are the requested
        features in the fixed order children, controv, isRoot, verb, noun, adj,
        url, child*controv, child*isRoot, child*isRoot*controv, controv*isRoot,
        followed by a constant 1.0 column and then the word-count columns.
        y is a list of one-element rows holding the popularity score.
    """
    # Membership tests do not change during the loop, so evaluate them once.
    wantChildren = 'children' in featureType
    wantControv = 'controv' in featureType
    wantIsRoot = 'isRoot' in featureType
    wantText = numberOfTextFeatures > 0 and 'text' in featureType
    wantVerb = 'verb' in featureType
    wantNoun = 'noun' in featureType
    wantAdj = 'adj' in featureType
    wantURL = 'url' in featureType
    wantChildControv = 'child and controv' in featureType
    wantChildIsRoot = 'child and isRoot' in featureType
    wantAllInteracted = 'all interacted' in featureType
    wantControvIsRoot = 'controv and isRoot' in featureType
    # The interaction terms need the real is_root value even when the plain
    # 'isRoot' column is not requested; previously they silently multiplied by
    # a -1.0 placeholder in that case.
    needIsRoot = wantIsRoot or wantChildIsRoot or wantAllInteracted or wantControvIsRoot

    y = []
    childrenFeature = []
    controversialityFeature = []
    isRootFeature = []
    processedTextFeature = []
    verbFeature = []
    nounFeature = []
    adjFeature = []
    urlFeature = []
    childAndControv = []
    childAndisRoot = []
    allInteracted = []
    controvAndisRoot = []
    identityFeature = []

    lenV = len(dataVector)
    lastPercent = -1
    for done, dataPoint in enumerate(dataVector, start=1):

        # Basic Features
        y.append(float(dataPoint['popularity_score']))
        if wantChildren:
            childrenFeature.append(1.0 - numpy.exp(-0.04 * float(dataPoint['children'])))
        if wantControv:
            controversialityFeature.append(float(dataPoint['controversiality']))
        identityFeature.append(1.0)  # constant column (intercept)
        isRootVar = 0.0
        if needIsRoot:
            isRootVar = 1.0 if dataPoint['is_root'] == True else 0.0
        if wantIsRoot:
            isRootFeature.append(isRootVar)

        # Text Features: 0 to shut down text feature
        if wantText:
            processedTextFeature.append([numpy.exp(-0.04 * (a)) * numpy.cos(3 * a) for a in wordCountFunction(dataPoint['text'], numberOfTextFeatures)])

        # Extra Features: tag the text once and reuse the result for all three.
        if wantNoun or wantVerb or wantAdj:
            wordAnalysis = wordPOSCountWithStopwords(dataPoint['text'])
        if wantVerb:
            verbFeature.append(wordAnalysis[0])
        if wantNoun:
            nounFeature.append(wordAnalysis[1])
        if wantAdj:
            adjFeature.append(wordAnalysis[2])
        if wantURL:
            urlFeature.append(hasURL(dataPoint['text']))

        # Interaction Effect (raw children count, not the transformed column)
        if wantChildControv:
            childAndControv.append(float(dataPoint['children']) * float(dataPoint['controversiality']))
        if wantChildIsRoot:
            childAndisRoot.append(float(dataPoint['children']) * isRootVar)
        if wantAllInteracted:
            allInteracted.append(float(dataPoint['children']) * isRootVar * float(dataPoint['controversiality']))
        if wantControvIsRoot:
            controvAndisRoot.append(isRootVar * float(dataPoint['controversiality']))

        # Progress bar: redraw only when the percentage changes, count completed
        # rows so the bar ends at 100%, and do not sleep between rows.
        percent = done * 100 // lenV
        if percent != lastPercent:
            sys.stdout.write("\r[%-20s] %d%%" % ('=' * (percent // 5), percent))
            sys.stdout.flush()
            lastPercent = percent
    if lenV > 0:
        sys.stdout.write('\n')

    # Text features are collected row-wise; turn them into columns like the rest.
    if len(processedTextFeature) > 0:
        processedTextFeature = pythonListTranspose(processedTextFeature)

    returnTotal = [childrenFeature, controversialityFeature, isRootFeature, verbFeature, nounFeature, adjFeature, urlFeature, childAndControv, childAndisRoot, allInteracted, controvAndisRoot, identityFeature] + processedTextFeature
    # Drop the columns that were not requested, then switch to row-major layout.
    return pythonListTranspose([x for x in returnTotal if len(x) > 0]), pythonListTranspose([y])

## Part 2: Regression Algorithms

In [11]:
def meanSquareError(valX, valY, w):
    """Mean squared error of the linear model valX @ w against valY.

    @Param valX: design matrix, one row per sample.
    @Param valY: targets as a column (one single-element row per sample).
    @Param w: weight column with one row per column of valX.
    @Return: sum of squared residuals of the first target column divided by
        the number of rows in valY.
    """
    predictions = numpy.matmul(valX, w)
    residuals = numpy.subtract(valY, predictions)
    # Transposing and taking row 0 extracts the residual column as a 1-D array.
    firstColumn = numpy.transpose(residuals)[0]
    squaredErrors = numpy.power(firstColumn, 2)
    return numpy.divide(numpy.sum(squaredErrors), len(valY))

In [12]:
def closedFormLinearRegression(x, y):
    """Least-squares weights from the normal equations (X^T X) w = X^T y.

    The linear system is solved directly with numpy.linalg.solve instead of
    forming an explicit inverse: this is numerically better behaved and does
    not rely on the scipy.linalg submodule being loaded by a bare `import scipy`.

    @Param x: design matrix (list of rows or 2-D array).
    @Param y: targets, as a column (n x 1) or a 1-D sequence of length n.
    @Return: ndarray of weights with one row per column of x (d x 1 for a
        column y).
    @Raise numpy.linalg.LinAlgError: when X^T X is singular.
    """
    xArr = numpy.array(x)
    yArr = numpy.array(y)
    xT = numpy.transpose(xArr)
    return numpy.linalg.solve(numpy.matmul(xT, xArr), numpy.matmul(xT, yArr))

In [13]:
def gradientDescentLinearRegression(learnRateFunction, x, y, tol, maxIterations=None):
    """Fit linear-regression weights by gradient descent on the squared error.

    Each step applies w <- w - 2 * learnRateFunction(i) * (X^T X w - X^T y),
    starting from w = 0 with i = 1, and stops once the 2-norm of the change in
    w is at most tol.

    @Param learnRateFunction: callable mapping the 1-based iteration number to
        the learning rate for that step.
    @Param x: design matrix (list of rows or 2-D array).
    @Param y: targets as a column (n x 1).
    @Param tol: convergence threshold on the norm of the weight update.
    @Param maxIterations: optional cap on the number of updates; None (the
        default) keeps the original unbounded behaviour.
    @Return: ndarray of shape (d, 1) holding the last computed weights.
    @Raise FloatingPointError: when the weights become NaN or infinite. The
        convergence test can never succeed in that state, so the loop used to
        spin forever.
    """
    xT = numpy.transpose(x)
    xTx = numpy.matmul(xT, x)
    xTy = numpy.matmul(xT, y)
    weightN = numpy.zeros((len(x[0]), 1))
    i = 1
    while maxIterations is None or i <= maxIterations:
        weight = weightN
        gradient = numpy.subtract(numpy.matmul(xTx, weight), xTy)
        weightN = numpy.subtract(weight, 2 * learnRateFunction(i) * gradient)
        i = i + 1
        if not numpy.all(numpy.isfinite(weightN)):
            raise FloatingPointError(
                "gradient descent diverged after %d iterations; use a smaller learning rate" % (i - 1))
        if numpy.linalg.norm(numpy.subtract(weightN, weight), 2) <= tol:
            break
    return weightN

## Part 3: Closed Form and Gradient Descent

In [14]:
# Baseline design matrices: children, controversiality and is_root plus the
# constant column. Text features are switched off (numberOfTextFeatures = 0), so
# parseFeatures never calls the word-count callback. Pass None rather than a
# lambda closing over tFreq / vFreq, which are only defined in later cells.
trainFeatures = parseFeatures(train, None, 0, ['children', 'controv', 'isRoot'])
validationFeatures = parseFeatures(validation, None, 0, ['children', 'controv', 'isRoot'])



In [15]:
# Fit the baseline model in closed form on the training set, then compute the
# mean squared error on the validation set and on the training set itself.
resultClosed = closedFormLinearRegression(trainFeatures[0], trainFeatures[1])
errorClosed = meanSquareError(validationFeatures[0], validationFeatures[1], resultClosed)
errorTrained = meanSquareError(trainFeatures[0], trainFeatures[1], resultClosed)
print('result of closed form: \n', resultClosed)
# 'error of closed form' is the validation MSE; 'closed trained' is the training MSE.
print('error of closed form: \n', errorClosed)
print('error of closed trained: \n', errorTrained)

result of closed form: 
 [[ 0.37536403]
 [-1.08584747]
 [-0.22627679]
 [ 0.82092517]]
error of closed form: 
 1.0203266848431447
error of closed trained: 
 1.0846830709157251


In [16]:
# Fit the same baseline model by gradient descent. The learning rate decays as
# 0.0020 / (i + 7) with the iteration number i, and iteration stops when the
# weight update norm drops to 5e-8 or below. The error is the validation MSE.
resultGradient = gradientDescentLinearRegression(lambda v: float(0.0020 / (float(v) + 7.0)), trainFeatures[0], trainFeatures[1], 0.00000005)
errorGradient = meanSquareError(validationFeatures[0], validationFeatures[1], resultGradient)
print('result of gradient descent: \n', resultGradient)
print('error of gradient descent: \n', errorGradient)

result of gradient descent: 
 [[ 0.37533096]
 [-1.07318692]
 [-0.22619012]
 [ 0.82075189]]
error of gradient descent: 
 1.020380531559443


Conclusion: the closed-form solution gives a slightly lower validation error than gradient descent (1.02033 vs. 1.02038).

## Part 3: 60 and 160 text features

In [17]:
# Vocabulary: the 160 most frequent lower-cased whitespace tokens of the training set.
tFreq = countAllFrequencyNaive(train)
# Training design matrices using the first 60 / all 160 vocabulary words as count features.
trainFeatures60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'text'])
trainFeatures160 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'isRoot', 'text'])



In [18]:
# NOTE(review): vFreq is computed here but not used by the calls in this cell.
# They reuse the training vocabulary tFreq, which keeps the validation columns
# aligned with the training columns.
vFreq = countAllFrequencyNaive(validation)
validationFeatures60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'text'])
validationFeatures160 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'isRoot', 'text'])



In [None]:
# Closed-form fit with 60 word-count features; report validation MSE, then training MSE.
resultClosed60 = closedFormLinearRegression(trainFeatures60[0], trainFeatures60[1])
errorClosed60 = meanSquareError(validationFeatures60[0], validationFeatures60[1], resultClosed60)
errorTrained60 = meanSquareError(trainFeatures60[0], trainFeatures60[1], resultClosed60)
# print('result of closed form: \n', resultClosed60)
print('error of closed form: \n', errorClosed60)
print('error of closed trained: \n', errorTrained60)

error of closed form: 
 0.9839397297217666
error of closed trained: 
 1.060429141685383


In [None]:
# Closed-form fit with 160 word-count features; report validation MSE, then training MSE.
resultClosed160 = closedFormLinearRegression(trainFeatures160[0], trainFeatures160[1])
errorClosed160 = meanSquareError(validationFeatures160[0], validationFeatures160[1], resultClosed160)
errorTrained160 = meanSquareError(trainFeatures160[0], trainFeatures160[1], resultClosed160)
# print('result of closed form: \n', resultClosed160)
print('error of closed form: \n', errorClosed160)
print('error of closed trained: \n', errorTrained160)
# Cell output: number of training targets (sanity check).
len(trainFeatures160[1])

error of closed form: 
 0.9950693970669265
error of closed trained: 
 1.0477763217987115


10000

Conclusion: 60 text features generalize better than 160. Going from 60 to 160 features lowers the training error (1.0604 → 1.0478) but raises the validation error (0.9839 → 0.9951), which suggests overfitting.

## Extra experiment: count word frequencies and eliminate stopwords

In [None]:
# Vocabulary of the 160 most frequent training tokens after removing punctuation
# and stopwords, and the matching 60- and 160-word training design matrices.
# `train` is the same slice as data[:10000]; use the name for consistency with
# the other cells instead of repeating the split boundary.
tFreqS = countAllFrequencyStopWord(train)
trainFeatures60S = parseFeatures(train, lambda u, v: wordCountWithStopwords(u, v, tFreqS), 60, ['children', 'controv', 'isRoot', 'text'])
trainFeatures160S = parseFeatures(train, lambda u, v: wordCountWithStopwords(u, v, tFreqS), 160, ['children', 'controv', 'isRoot', 'text'])

In [None]:
# Validation design matrices built with the training stopword-filtered vocabulary tFreqS.
validationFeatures60S = parseFeatures(validation, lambda u, v: wordCountWithStopwords(u, v, tFreqS), 60, ['children', 'controv', 'isRoot', 'text'])
validationFeatures160S = parseFeatures(validation, lambda u, v: wordCountWithStopwords(u, v, tFreqS), 160, ['children', 'controv', 'isRoot', 'text'])

In [None]:
# Closed-form fit with 60 stopword-filtered word counts; validation MSE, then training MSE.
resultClosed60S = closedFormLinearRegression(trainFeatures60S[0], trainFeatures60S[1])
errorClosed60S = meanSquareError(validationFeatures60S[0], validationFeatures60S[1], resultClosed60S)
errorTrained60S = meanSquareError(trainFeatures60S[0], trainFeatures60S[1], resultClosed60S)
# print('result of closed form: \n', resultClosed60S)
print('error of closed form: \n', errorClosed60S)
print('error of closed trained: \n', errorTrained60S)

In [None]:
# Closed-form fit with 160 stopword-filtered word counts; validation MSE, then training MSE.
resultClosed160S = closedFormLinearRegression(trainFeatures160S[0], trainFeatures160S[1])
errorClosed160S = meanSquareError(validationFeatures160S[0], validationFeatures160S[1], resultClosed160S)
errorTrained160S = meanSquareError(trainFeatures160S[0], trainFeatures160S[1], resultClosed160S)
# print('result of closed form: \n', resultClosed160S)
print('error of closed form: \n', errorClosed160S)
print('error of closed trained: \n', errorTrained160S)
# Sanity check on the matrices used in this cell (previously inspected the
# unfiltered trainFeatures160 by copy-paste).
len(trainFeatures160S[1])

Conclusion: eliminating stopwords does not give better results.

## Experimenting with new features: fraction of noun and existence of URL

In [None]:
def wordPOSCountWithStopwords(singleText):
    """Fractions of verbs, nouns and adjectives in a text.

    The text is tokenised with the regular expression \\w+, lower-cased and
    stripped of stopwords before part-of-speech tagging with nltk.pos_tag.
    Tags containing 'NN' count as nouns, 'VB' as verbs and 'JJ' as adjectives.
    Each total is divided by the number of whitespace-separated words of the
    original text (stopwords included).

    @Param singleText: text to process.
    @Return: [verbFraction, nounFraction, adjFraction]; all 0.0 when the text
        contains no words.
    """
    totalCount = len(singleText.split())
    if totalCount == 0:
        return [0.0, 0.0, 0.0]

    tokenizer = RegexpTokenizer(r'\w+')
    setOfStop = set(stopwords.words())
    keptWords = [s.lower() for s in tokenizer.tokenize(singleText) if s.lower() not in setOfStop]

    verbTotal, nounTotal, adjTotal = 0, 0, 0
    # Count every tagged word. The previous loop iterated over the distinct tags
    # and added 1 per tag, so it counted tag types instead of words.
    for _, tag in nltk.pos_tag(keptWords):
        if 'NN' in tag:
            nounTotal = nounTotal + 1
        elif 'VB' in tag:
            verbTotal = verbTotal + 1
        elif 'JJ' in tag:
            adjTotal = adjTotal + 1

    return [float(verbTotal) / totalCount, float(nounTotal) / totalCount, float(adjTotal) / totalCount]
# Test: 
# print(wordPOSCountWithStopwords(data[3]['text']))
# print(data[3]['text'])


def hasURL(text):
    """Return 1.0 when the text contains an http:// or https:// URL, else 0.0.

    re.search is used so that a URL anywhere in the text is found; re.match
    only looked at the very beginning of the text. At least one non-space
    character must follow the scheme.
    """
    if re.search(r"https?://[^ ]+", text) is not None:
        return 1.0
    return 0.0

### With fraction of noun in whole words

In [None]:
# NOTE(review): 'text' is not in the feature list, so parseFeatures produces no
# word-count columns here even though 60 is passed. The columns are children,
# controversiality, is_root, the noun fraction and the constant column.
trainMyFeatures60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'noun'])

In [None]:
# vFreq = countAllFrequencyNaive(validation)
# Validation counterpart of trainMyFeatures60 (same feature list, so again no word-count columns).
validationMyFeatures60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'noun'])

In [None]:
# Closed-form fit with the noun-fraction feature; validation MSE, then training MSE.
resultMyClosed = closedFormLinearRegression(trainMyFeatures60[0], trainMyFeatures60[1])
errorMyClosed = meanSquareError(validationMyFeatures60[0], validationMyFeatures60[1], resultMyClosed)
errorMyTrained = meanSquareError(trainMyFeatures60[0], trainMyFeatures60[1], resultMyClosed)
# print('result of closed form: \n', resultMyClosed)
print('error of closed form: \n', errorMyClosed)
print('error of closed trained: \n', errorMyTrained)

### With URL

In [None]:
# Basic features plus the URL indicator. As 'text' is not in the feature list,
# no word-count columns are produced despite the 60.
trainMyFeaturesURL60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'url'])
validationMyFeaturesURL60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'url'])

In [None]:
# Closed-form fit with the URL indicator; validation MSE, then training MSE.
resultMyClosedURL = closedFormLinearRegression(trainMyFeaturesURL60[0], trainMyFeaturesURL60[1])
errorMyClosedURL = meanSquareError(validationMyFeaturesURL60[0], validationMyFeaturesURL60[1], resultMyClosedURL)
errorMyTrainedURL = meanSquareError(trainMyFeaturesURL60[0], trainMyFeaturesURL60[1], resultMyClosedURL)
# print('result of closed form: \n', resultMyClosedURL)
print('error of closed form: \n', errorMyClosedURL)
print('error of closed trained: \n', errorMyTrainedURL)

## With all

In [None]:
# Basic features plus both extra features (noun fraction and URL indicator).
# As 'text' is not in the feature list, no word-count columns are produced.
trainMyFeaturesALL60 = parseFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'noun', 'url'])
validationMyFeaturesALL60 = parseFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 60, ['children', 'controv', 'isRoot', 'noun', 'url'])

In [None]:
# Closed-form fit with both extra features; validation MSE, then training MSE.
resultMyClosedALL = closedFormLinearRegression(trainMyFeaturesALL60[0], trainMyFeaturesALL60[1])
errorMyClosedALL = meanSquareError(validationMyFeaturesALL60[0], validationMyFeaturesALL60[1], resultMyClosedALL)
errorMyTrainedALL = meanSquareError(trainMyFeaturesALL60[0], trainMyFeaturesALL60[1], resultMyClosedALL)
# print('result of closed form: \n', resultMyClosedALL)
print('error of closed form: \n', errorMyClosedALL)
print('error of closed trained: \n', errorMyTrainedALL)

conclusion: adding extra features does not help with accuracy

The coefficients of x4 (noun fraction) and x5 (URL indicator) are not statistically significant.

### Test of coefficients (Hastie et al.)

In [None]:
# Refit the model with statsmodels OLS (targets first, design matrix second) and
# display its summary table, which is where the coefficient tests are read from.
import statsmodels.api as sm
sm.OLS(trainMyFeaturesALL60[1], trainMyFeaturesALL60[0]).fit().summary()

### (Improved!) Interaction Effect among basic features

In [None]:
# Basic features plus all four interaction terms, built with
# parseTransformedFeatures. Text features are off (0 text features and no 'text'
# entry), so the word-count callback is never called.
trainFeaturesI = parseTransformedFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 0, ['children', 'controv', 'isRoot', 'child and controv', 'child and isRoot', 'all interacted', 'controv and isRoot'])
validationFeaturesI = parseTransformedFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 0, ['children', 'controv', 'isRoot', 'child and controv', 'child and isRoot', 'all interacted', 'controv and isRoot']) 

In [None]:
# Closed-form fit with the interaction terms; validation MSE, then training MSE.
resultClosedI = closedFormLinearRegression(trainFeaturesI[0], trainFeaturesI[1])
errorClosedI = meanSquareError(validationFeaturesI[0], validationFeaturesI[1], resultClosedI)
errorTrainedI = meanSquareError(trainFeaturesI[0], trainFeaturesI[1], resultClosedI)
print('result of closed form: \n', resultClosedI)
print('error of closed form: \n', errorClosedI)
print('error of closed trained: \n', errorTrainedI)

Conclusion: transforming the children variable with g(x) = 1 - exp(-0.04x) and interacting the transformed children with isRoot decreases the loss.

In [None]:
# statsmodels OLS summary for the interaction model. `sm` is imported in the
# earlier statsmodels cell, which must have been run first.
sm.OLS(trainFeaturesI[1], trainFeaturesI[0]).fit().summary()

conclusion: interaction terms are not significant (all p-values are not smaller than 0.001)

## Extra experiment: Forward Selection

In [None]:
import pandas
# Candidate feature pool for forward selection: transformed children,
# controversiality, is_root, the children*controversiality interaction, the
# constant column and 160 transformed word counts.
# NOTE(review): these names reuse trainMyFeatures60 / validationMyFeatures60
# from the noun-feature cells and, despite the "60", hold 160 text features.
trainMyFeatures60 = parseTransformedFeatures(train, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'isRoot', 'child and controv', 'text']) # , 'child and controv', 'text'
validationMyFeatures60 = parseTransformedFeatures(validation, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'child and controv', 'isRoot', 'text'])
testMyFeatures60 = parseTransformedFeatures(test, lambda u, v: wordCountNaive(u, v, tFreq), 160, ['children', 'controv', 'isRoot', 'child and controv', 'text'])
# , 'noun', 'url'

In [None]:
import matplotlib.pyplot as plt
# Scatter plot of design-matrix column 6 against the target on the training set.
# NOTE(review): with the feature list used to build trainMyFeatures60, column 6
# appears to be the transformed count of the second most frequent word - confirm.
plt.scatter(numpy.transpose(trainMyFeatures60[0])[6], numpy.transpose(trainMyFeatures60[1]), alpha = 0.05)

In [None]:
# Wrap the design matrices in DataFrames so that column subsets can be selected
# by integer label during forward selection; the cell output shows both shapes.
traindf = pandas.DataFrame(trainMyFeatures60[0])
validationdf = pandas.DataFrame(validationMyFeatures60[0])
traindf.shape, validationdf.shape

In [None]:
# Forward stepwise selection on the validation error.
# Starting from the empty set, each level adds the single predictor (column of
# traindf) whose inclusion gives the lowest validation MSE. levelBest keeps the
# best [mse, predictorSet] pair of every level and bestOfAll is the best level.
#
# Fixes over the previous version:
#   * the candidate pool is derived from traindf instead of a hard-coded 165;
#   * the loop runs up to and including the full model (it used to stop one
#     level early, and the hand-written full-model step afterwards appended to
#     mseVal with a stale `selected`, so the full model was never considered);
#   * an empty candidate list is checked before min() is called;
#   * only singular-matrix / invalid-input errors are skipped, not every exception.
fullPredictors = set(range(traindf.shape[1]))
trainY = trainMyFeatures60[1]
validY = validationMyFeatures60[1]

# Sentinel entry for the empty model so that levelBest is never empty.
levelBest = [[sys.float_info.max, set()]]
prevBest = set()
totalLevels = len(fullPredictors)

for k in range(1, totalLevels + 1):

    mseVal = []
    for predictor in sorted(fullPredictors - prevBest):
        selected = prevBest | {predictor}
        columns = sorted(selected)

        trainX = traindf[columns].values
        validX = validationdf[columns].values

        try:
            resultW = closedFormLinearRegression(trainX, trainY)
        except (numpy.linalg.LinAlgError, ValueError):
            # X^T X is singular (or otherwise unusable) for this subset: skip it.
            continue
        mseVal.append([meanSquareError(validX, validY, resultW), selected])

    if len(mseVal) == 0:
        # No candidate could be fitted at this level, so none can be extended.
        print('level %d/%d: no candidate could be fitted, stopping' % (k, totalLevels))
        break

    currentBest = min(mseVal, key = lambda x: x[0])
    levelBest.append(currentBest)
    # One line per level doubles as the progress report.
    print('level %d/%d: validation MSE %.6f, added predictor %s' % (k, totalLevels, currentBest[0], sorted(currentBest[1] - prevBest)))
    prevBest = currentBest[1]

bestOfAll = min(levelBest, key = lambda v: v[0])

In [None]:
# bestOfAll is [validation MSE, set of selected column indices]; the two labels
# previously printed each other's value.
print('best set of predictors with interaction term: \n', bestOfAll[1])
print('error: \n', bestOfAll[0])

## Final Performance

In [None]:
# Refit on the training set using only the columns chosen by forward selection,
# then report the mean squared error on the held-out test set (cell output).
resultClosedFinal = closedFormLinearRegression(numpy.array(trainMyFeatures60[0])[:,list(bestOfAll[1])], trainMyFeatures60[1])
meanSquareError(numpy.array(testMyFeatures60[0])[:,list(bestOfAll[1])], testMyFeatures60[1], resultClosedFinal)

Conclusion: overfits validation set (meta-overfitting)