In [None]:
import re

import pyspark as ps    # for the pyspark suite
import os               # for environ variables in Part 3

%load_ext autoreload
%autoreload 2

# Create the SparkSession shared by every cell below, or reuse a running one.
spark = (
    ps.sql.SparkSession.builder
    .appName("df lecture")
    .getOrCreate()
)
        
import numpy as np
import pandas as pd

from pyspark.sql.functions import rand
from pyspark.sql.types import ArrayType, IntegerType, StringType

In [287]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml.feature import NGram


In [77]:
# ### Converts the flattened pandas export of `items` into a list of per-row dicts (plain JSON records) and writes it to disk. Kept commented out in case it is needed later.

# panda = items.toPandas()

# pcols = panda.columns

# pcols = cycle(pcols)

# jint = 0
# jlist = []
# jdict = {}

# for i in panda.values.flatten():
#     key = pcols.next()
#     jdict[key] = i
#     if key == 'user_id':
#         jlist.append(jdict)
#         jdict = {}
#         jint += 1
   


# with open('../data/acc_dataset_local.json', 'w') as jfile:
#     json.dump(jlist, jfile)

In [30]:
# yelp = spark.read.json('../data/yelp_academic_reviews.json')

In [86]:
# Read the local review extract into the `yelp` DataFrame.
yelp = (
    spark.read
    .json('../data/acc_dataset_local.json')
)

In [88]:
# Print the schema Spark inferred for the loaded reviews.
yelp.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [216]:
def kill_non_ascii(text):
    """Lower-case ``text`` and split it into tokens of ASCII letters and apostrophes.

    Despite the name, this removes more than non-ASCII characters: every
    character other than ``a-z``, ``'`` and space (digits, punctuation,
    newlines, accented letters, ...) is replaced by a space and therefore
    acts as a token separator.

    Args:
        text: the raw string, or None (for example a NULL column value).

    Returns:
        list of str: the tokens in their original order. An empty list is
        returned for None and for text that contains no letters.
    """
    if text is None:
        # NULL values reach the function as None; treat them as empty text
        # instead of failing on ``None.lower()``.
        return []
    cleaned = re.sub("[^a-z' ]", ' ', text.lower())
    # split() with no argument collapses runs of spaces and drops empty tokens.
    return cleaned.split()

In [240]:
spark.udf.register('sbin', lambda x: 1 if x > 3 else 0)
spark.udf.register('imbin', lambda x: 1 if x > 2 else 0)
spark.udf.register('mkascii', kill_non_ascii)
spark.udf.register('listjoin', lambda x: ' '.join(x))

In [228]:
yelp.registerTempTable('yelp')

r_bin =  spark.sql('''
            SELECT array(mkascii(text)) as content, int(imbin(useful + funny + cool)) as relevant, int(sbin(stars)) as good
            FROM yelp
        ''')


In [229]:
# Show the first row to eyeball the token array and the two binary flags.
r_bin.first()

Row(content=[u'[saturday, night, late, i, was, getting, warm, when, i, checked, the, thermostat, to, see, if, the, central, ac, was, on, and, yes, it, was, but, it, was, blowing, warm, air, oh, no, so, now, my, air, conditioning, decided, the, day, it, was, degrees, to, stop, working, i, called, sunday, afternoon, and, spoke, with, mark, i, told, him, about, the, issue, with, my, ac, and, he, said, the, earliest, he, could, get, here, was, sometime, monday, i, was, fine, with, that, even, tough, it, was, degrees, in, the, house, my, wife, and, i, were, ok, but, a, bit, worried, about, our, dogs, and, how, they, would, take, the, heat, amy, marks, wife, called, this, morning, to, confirm, that, i, would, be, home, mark, came, around, asked, a, few, questions, and, went, right, to, work, after, diagnosing, the, problem, he, came, back, and, told, us, what, was, wrong, what, needed, to, be, done, and, the, cost, to, have, it, repaired, i, agreed, to, the, repair, about, hour, later, he, w

In [230]:
# Number of rows flagged relevant; used below as the per-class sample size.
mincount = r_bin.where('relevant > 0').count()

In [231]:
# Balance the two classes by undersampling the non-relevant rows.
# The seed is fixed so that re-running the notebook on the same data draws the
# same sample (NOTE: assumes the input partitioning is unchanged between runs).
SAMPLE_SEED = 42

# `relevant` is a 0/1 flag and `mincount` is the number of rows with
# relevant > 0, so every positive row is kept. Shuffling and limiting the
# positives would add a full sort without changing which rows are selected.
dataset_pos = r_bin.filter('relevant = 1')
dataset_neg = r_bin.filter('relevant = 0').orderBy(rand(seed=SAMPLE_SEED)).limit(mincount)

df_relevance = dataset_pos.union(dataset_neg)

In [232]:
# Keep only `content` and `relevant`; the `good` flag is not used in the cells below.
df_relevance = df_relevance.drop('good')

In [233]:
# Confirm that only `content` and `relevant` remain.
df_relevance.printSchema()

root
 |-- content: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- relevant: integer (nullable = true)



In [238]:
# Filter stop words out of the `content` token arrays into a new `filtered` column.
remover = StopWordsRemover(
    inputCol="content",
    outputCol="filtered",
)
df_rel_stopped = remover.transform(df_relevance)

In [241]:
# Check that the `filtered` column was added alongside `content` and `relevant`.
df_rel_stopped.printSchema()

root
 |-- content: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- relevant: integer (nullable = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [295]:
# Build bigrams (n=2) from the stop-word-filtered tokens into an `ngrams` column.
ngram = NGram(
    n=2,
    inputCol="filtered",
    outputCol="ngrams",
)
ngramDataFrame = ngram.transform(df_rel_stopped)

In [298]:
# Check that the `ngrams` column was added.
ngramDataFrame.printSchema()

root
 |-- content: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- relevant: integer (nullable = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngrams: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [302]:
# Expose the stop-word-filtered frame to Spark SQL.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0,
# and can be re-run without error because it overwrites an existing view.
df_rel_stopped.createOrReplaceTempView('df_rel_stopped')

# Join each `filtered` token array back into one space-separated string,
# because the Tokenizer applied in the next cell takes a string column as input.
stop_strings = spark.sql('''
            SELECT listjoin(filtered) as filtered, content, relevant
            FROM df_rel_stopped
            ''')

### add ngrams later if needed
# ngramDataFrame.registerTempTable('ngrammed_stopped_rel')

# stop_strings = spark.sql('''
#             SELECT listjoin(filtered) as filtered, ngrams, content, relevant
#             FROM ngrammed_stopped_rel
#             ''')

In [303]:
# Stage 1: split the space-joined `filtered` string into a `words` array.
tokenizer = Tokenizer(
    inputCol="filtered",
    outputCol="words",
)
wordsData = tokenizer.transform(stop_strings)

# Stage 2: hash each word into one of 2500 buckets to get term-frequency vectors.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=2500,
)
featurizedData = hashingTF.transform(wordsData)

# Stage 3: fit IDF weights on the whole frame and rescale the term frequencies.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# rescaledData.select("relevant", "features").show()

In [307]:
# Print the schema after featurisation.
rescaledData.printSchema()

# Show the TF-IDF vector of the first row as a sanity check.
rescaledData.select('features').first()

root
 |-- filtered: string (nullable = true)
 |-- content: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- relevant: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)



Row(features=SparseVector(2500, {61: 3.2888, 117: 1.9087, 132: 1.286, 224: 1.8984, 232: 2.193, 297: 0.4224, 405: 2.6723, 614: 1.1202, 666: 1.6664, 696: 0.8122, 813: 4.201, 855: 1.2046, 906: 1.1156, 1103: 0.9572, 1142: 4.3087, 1171: 3.3924, 1183: 3.662, 1184: 0.2636, 1207: 2.9454, 1220: 1.1831, 1309: 2.765, 1388: 2.9338, 1468: 0.5055, 1658: 4.9574, 1663: 4.6917, 1685: 0.3915, 1768: 1.6893, 1848: 0.8515, 1863: 2.3563, 1873: 2.0015, 1946: 0.6801, 2092: 1.977, 2096: 2.6723, 2139: 3.6502, 2179: 1.2547, 2200: 0.3792, 2299: 2.713, 2304: 2.258, 2432: 9.5186, 2459: 4.0675, 2473: 2.8406}))

In [78]:
def autostem(cell):
    """Return the ``'text'`` entry of ``cell``.

    ``cell`` must provide ``asDict()``; a missing ``'text'`` key raises KeyError.
    NOTE(review): despite the name, no stemming is performed here.
    """
    fields = cell.asDict()
    return fields['text']

In [82]:
# Make `autostem` callable from SQL as `pstem` (the query below does not use it).
spark.udf.register('pstem', autostem)

# Count the rows of `yelp` whose `text` is not NULL.
corpus_count_query = '''
                    SELECT count(text)
                    FROM yelp
                    '''
all_corpus = spark.sql(corpus_count_query)

all_corpus.first()

Row(count(text)=4153150)