In [107]:
import json
import nltk
import re
import itertools
from time import sleep
import sys
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
with open("proj1_data.json") as fp:
    data = json.load(fp)

In [108]:
# word count, remove punc but not stopwords
# @Param: singleText: text to process, numberOfFeatures: 0, 60, 160?
# @Return: vector of count (x in description)
def wordCountWithoutStopwords(singleText, numberOfFeatures): 

    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    countDict = Counter(s.lower() for s in withoutPunc)
        
    i = 0
    returnVector = [0.0 for i in range(0, numberOfFeatures)]
    for key, value in countDict.most_common(numberOfFeatures): 
        if i == numberOfFeatures: 
            break
        returnVector[i] = float(value)
        i = i + 1
    
    return returnVector

# Test: 
# print(wordCountWithoutStopwords([data[8]['text']], 160))
# print(data[8]['text'])



# word count, remove punc and stopwords to imporve model
# @Param: singleText: text to process, numberOfFeatures: 0, 60, 160?
# @Return: vector of count (x in description)
def wordCountWithStopwords(singleText, numberOfFeatures): 
    
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    countDict = Counter(s.lower() for s in withoutPunc if s.lower() not in stopwords.words())
    
    i = 0
    returnVector = [0.0 for i in range(0, numberOfFeatures)]
    for key, value in countDict.most_common(numberOfFeatures): 
        if i == numberOfFeatures: 
            break
        returnVector[i] = float(value)
        i = i + 1
    
    return returnVector

# Test: 
# print(wordCountWithStopwords([data[8]['text']], 160))
# print(data[8]['text'])

In [109]:
# word pos count, remove punc but not stopwords
# @Param: singleText: text to process
# @Return: @Return: vector of [verbcount, nouncount, adjcount]
def wordPOSCountWithoutStopwords(singleText): 
    
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    tagged = nltk.pos_tag([s.lower() for s in withoutPunc])
    
    verbTotal, nounTotal, adjTotal = 0, 0, 0
    counts = Counter(tag for wordType, tag in tagged)
    totalCount = sum(counts.values())
    
    for key, value in counts.items(): 
        if 'NN' in key: 
            nounTotal = nounTotal + 1
        elif 'VB' in key: 
            verbTotal = verbTotal + 1
        elif 'JJ' in key: 
            adjTotal = adjTotal + 1
            
    if totalCount > 0: 
        return [float(verbTotal) / totalCount, float(nounTotal) / totalCount, float(adjTotal) / totalCount]
    else: 
        return [0.0, 0.0, 0.0]
# Test: 
# print(wordPOSCountWithoutStopwords(data[3]['text']))
# print(data[3]['text'])



# word pos count, remove punc and stopwords to imporve model
# @Param: singleText: text to process
# @Return: vector of [verbcount, nouncount, adjcount]
def wordPOSCountWithStopwords(singleText): 
    
    tokenizer = RegexpTokenizer(r'\w+')
    withoutPunc = tokenizer.tokenize(singleText)
    tagged = nltk.pos_tag([s.lower() for s in withoutPunc if s.lower() not in stopwords.words()])
    
    verbTotal, nounTotal, adjTotal = 0, 0, 0
    counts = Counter(tag for wordType, tag in tagged)
    totalCount = sum(counts.values())
    
    for key, value in counts.items(): 
        if 'NN' in key: 
            nounTotal = nounTotal + 1
        elif 'VB' in key: 
            verbTotal = verbTotal + 1
        elif 'JJ' in key: 
            adjTotal = adjTotal + 1
            
    if totalCount > 0: 
        return [float(verbTotal) / totalCount, float(nounTotal) / totalCount, float(adjTotal) / totalCount]
    else: 
        return [0.0, 0.0, 0.0]
# Test: 
# print(wordPOSCountWithStopwords(data[3]['text']))
# print(data[3]['text'])

In [110]:
def hasURL(text): 
    if re.match(r"(http://[^ ]+)", text) != None: 
        return 1.0
    else: 
        return 0.0

In [111]:
def pythonListTranspose(xl): 
    return list(map(list, itertools.zip_longest(*xl)))

In [128]:
def parseFeatures(dataVector, wordCountFunction, wordPOSFunction, numberOfTextFeatures, featureType): 
    y = []
    childrenFeature = []
    controversialityFeature = []
    isRootFeature = []
    processedTextFeature = []
    verbFeature = []
    nounFeature = []
    adjFeature = []
    urlFeature = []
    crInteractionFeature = []
    
    c = 0
    lenV = len(dataVector)
    for dataPoint in dataVector: 
        y.append(float(dataPoint['popularity_score']))
        numberOfChildren = float(dataPoint['children'])
        childrenFeature.append(float(dataPoint['children']))
        controversialityFeature.append(float(dataPoint['controversiality']))
        if numberOfTextFeatures > 0: 
            processedTextFeature.append(wordCountFunction(dataPoint['text'], numberOfTextFeatures))
        wordAnalysis = wordPOSFunction(dataPoint['text'])
        verbFeature.append(wordAnalysis[0])
        nounFeature.append(wordAnalysis[1])
        adjFeature.append(wordAnalysis[2])
        isRootVar = -1.0
        if data[0]['is_root']: 
            isRootVar = 1.0
            isRootFeature.append(1.0)
        else: 
            isRootVar = 0.0
            isRootFeature.append(0.0)
        urlFeature.append(hasURL(dataPoint['text']))
        crInteractionFeature.append(isRootVar * numberOfChildren)
        
        sys.stdout.write('\r')
        count = int((float(c) / float(lenV)) * 100)
        sys.stdout.write("[%-20s] %d%%" % ('='*int(count / 5), count))
        sys.stdout.flush()
        c = c + 1
    
    mergedBasicFeatures = [childrenFeature, controversialityFeature, isRootFeature]
    if numberOfTextFeatures > 0: 
        mergedTextFeatures = mergedBasicFeatures + list(map(list, zip(*(processedTextFeature))))
    else: 
        mergedTextFeatures = mergedBasicFeatures
    mergedExtraFeatures = mergedTextFeatures + [verbFeature, nounFeature, adjFeature, urlFeature, crInteractionFeature]
    mergedNoTextExtraFeatures = mergedBasicFeatures + [verbFeature, nounFeature, adjFeature, urlFeature, crInteractionFeature]
    
    if featureType == 'basic': 
        return mergedBasicFeatures, y
    elif featureType == 'text': 
        return mergedTextFeatures, y
    elif featureType == 'extra':
        return mergedExtraFeatures, y
    elif featureType == 'extraOnly': 
        return mergedNoTextExtraFeatures, y
    else: 
        return mergedBasicFeatures, y

crazyFeatures = parseFeatures(data[:10], wordCountWithoutStopwords, wordPOSCountWithStopwords, 10, 'extra')
crazyFeaturesX = pythonListTranspose(crazyFeatures[0])
crazyFeaturesY = crazyFeatures[1]
print()
print(crazyFeaturesX)

[[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.25, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.16666666666666666, 0.0, 0.16666666666666666, 0.0, 0.0], [0.0, 0.0, 0.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.14705882352941177, 0.058823529411764705, 0.029411764705882353, 0.0, 0.0], [0.0, 0.0, 0.0, 10.0, 6.0, 5.0, 5.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 0.11363636363636363, 0.045454545454545456, 0.022727272727272728, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.2222222222222222, 0.1111111111111111, 0.0, 0.0], [1.0, 1.0, 0.0, 3.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.090909090909090

In [105]:
data[0]

{'text': 'ITS RAINING SIDEWAYS',
 'is_root': False,
 'controversiality': 0,
 'children': 0,
 'popularity_score': 1.254698160267241}