In [10]:
import pandas as pd
from pyspark import SparkContext, SparkConf
import numpy as np
from collections import defaultdict, Counter
import itertools
import nltk


In [2]:
from nltk.corpus import stopwords,wordnet
import re
def genCorpus(theText):
    #set dictionaries
    stopWords = set(stopwords.words('english'))
    theStemmer = nltk.stem.porter.PorterStemmer() #Martin Porters celebrated stemming algorithm
    
    #pre-processing
    theText = theText.split()
    tokens = [token.lower() for token in theText] #ensure everything is lower case
    tokens = [re.sub(r'[^a-zA-Z0-9]+', ' ',token) for token in tokens] #remove special characters but leave word in tact
    tokens = [token for token in tokens if token.lower().isalpha()] #ensure everything is a letter
    tokens = [word for word in tokens if word not in stopWords] #rid of stop words
    tokens = [theStemmer.stem(word) for word in tokens] #stem words uing porter stemming algorithm
    tokens = " ".join(tokens) #need to pass string seperated by spaces       

    return tokens

In [3]:
conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '8G')
        .set('spark.driver.memory', '12G')
        .set('spark.driver.maxResultSize', '10G'))
sc = SparkContext(conf=conf)

In [4]:
ufo = pd.read_csv('ufo_data.csv')

In [5]:
desc = ufo['Desc']
# desc.to_csv('file.txt', index=False)

In [6]:
desc = list(ufo['Desc'].apply(lambda x: str(x).strip('[]')))

## Problem 1

In [13]:
rdd = sc.parallelize(desc)
cl_rdd = rdd.map(lambda x: genCorpus(x))
words = cl_rdd.flatMap(lambda x: x.split())

In [14]:
wordCount = words.countByValue()
mostCommonWords = dict(Counter(wordCount).most_common(100))

In [15]:
words.count()

8709183

In [16]:

twoWordsRdd = cl_rdd.flatMap(lambda x: list(itertools.combinations(x.split(), 2)))
twoWords = twoWordsRdd.countByValue()

In [18]:
topTwoWords = dict(Counter(twoWords).most_common(100))

In [23]:
lofl = [list(i) for i in topTwoWords]
uniqueWords = list(set([item for i in range(len(lofl)) for item in lofl[i] ]))
wordsToPrint = list([(i,mostCommonWords[i]) for i in uniqueWords])

In [57]:
# [k,v for k in topTwoWords]

links = []
for k,v in topTwoWords.items():
    d = {'source': k[0], 'target': k[1], 'value':v/114000}
    links.append(d)

In [49]:
nodes = []
for i,c in wordsToPrint:
    d = {'id': i,"group":c/100000}
    nodes.append(d)

In [58]:
force = {'nodes':nodes,'links':links}

In [60]:
import json
# json.dumps(force)

with open('force.json', 'w') as fp:
    json.dump(force, fp)