In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

### Initialize PySpark

In [None]:
# Obtain the Spark session for this notebook (reuses a running one if present),
# registered under the application name 'wordembed'.
session_builder = SparkSession.builder.appName('wordembed')
spark = session_builder.getOrCreate()

23/12/08 19:51:41 WARN Utils: Your hostname, CelinedeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.100.25 instead (on interface en0)
23/12/08 19:51:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/08 19:51:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/08 19:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/08 19:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/12/08 19:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/12/08 19:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


### Load data into Spark dataframe

In [None]:
# Read the reviews JSON file into a Spark DataFrame.
reviews_path = '/Users/celine/Desktop/5430nlp/group/deduplicated_reviews.json'
wembed_df = spark.read.json(reviews_path)
# The row count is the value displayed by this cell.
wembed_df.count()

                                                                                

123861

In [None]:
# Narrow the frame to the business identifier, business name and review text,
# then preview the first five rows.
kept_columns = ['business_id', 'name', 'text']
wembed_df = wembed_df.select(*kept_columns)
wembed_df.show(n=5)

+--------------------+-------------+--------------------+
|         business_id|         name|                text|
+--------------------+-------------+--------------------+
|RI33oswGDkIsc0fuQ...|Oregon Steaks|after several ppl...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|The cheeseburgers...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|In my humble opin...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Overall this plac...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Perfect place for...|
+--------------------+-------------+--------------------+
only showing top 5 rows



In [None]:
# Duplicate the review text into 'inputText', the column the tokenizer below
# is configured to read from.
wembed_df = wembed_df.withColumn('inputText', wembed_df['text'])
wembed_df.show(n=5)

+--------------------+-------------+--------------------+--------------------+
|         business_id|         name|                text|           inputText|
+--------------------+-------------+--------------------+--------------------+
|RI33oswGDkIsc0fuQ...|Oregon Steaks|after several ppl...|after several ppl...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|The cheeseburgers...|The cheeseburgers...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|In my humble opin...|In my humble opin...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Overall this plac...|Overall this plac...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Perfect place for...|Perfect place for...|
+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



### Train a Word2Vec Model

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

In [None]:
# Regular-expression tokenizer that splits inputText into individual tokens (words).
# With gaps=False the pattern describes the tokens themselves (runs of word
# characters) rather than the separators between them.
# The pattern is a raw string so that `\w` reaches the regex engine verbatim
# instead of being an invalid Python string escape (a warning on recent Pythons).
regextok = RegexTokenizer(gaps=False, pattern=r'\w+',
                          inputCol='inputText', outputCol='tokens')
# StopWordsRemover filters stop words out of the 'tokens' lists, using the
# transformer's default stop-word list since none is passed here.
stopwrmv = StopWordsRemover(inputCol='tokens', outputCol='tokens_sw_removed')

In [None]:
# Tokenize the text first, then strip stop words from the resulting token lists;
# this adds the 'tokens' and 'tokens_sw_removed' columns.
wembed_df = stopwrmv.transform(regextok.transform(wembed_df))
wembed_df.show(n=5)

+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|         name|                text|           inputText|              tokens|   tokens_sw_removed|
+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+
|RI33oswGDkIsc0fuQ...|Oregon Steaks|after several ppl...|after several ppl...|[after, several, ...|[several, ppl, te...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|The cheeseburgers...|The cheeseburgers...|[the, cheeseburge...|[cheeseburgers, g...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|In my humble opin...|In my humble opin...|[in, my, humble, ...|[humble, opinion,...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Overall this plac...|Overall this plac...|[overall, this, p...|[overall, place, ...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Perfect place for...|Perfect place for...|[perfect, place, ...|[perfect, place, ...|
+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+

In [None]:
# Train 100-dimensional word embeddings on the stop-word-filtered tokens.
# minCount=5: a word must appear at least 5 times to enter the vocabulary.
# An explicit seed is set so that re-running the notebook starts training from
# the same random state instead of a per-process default.
word2vec = Word2Vec(vectorSize=100, minCount=5, seed=42,
                    inputCol='tokens_sw_removed', outputCol='wordvectors')
wembed_model = word2vec.fit(wembed_df)


                                                                                

In [None]:
# Persist the fitted Word2Vec model to disk.
model_path = "/Users/celine/Desktop/5430nlp/group/web/word_embedding_model"
# overwrite() replaces a model left behind by a previous run; a plain save()
# raises when the target path already exists, which breaks Restart & Run All.
wembed_model.write().overwrite().save(model_path)

23/12/08 19:53:25 WARN TaskSetManager: Stage 13 contains a task of very large size (1247 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [None]:

# Run the fitted Word2Vec model over wembed_df; the result keeps the existing
# columns and gains the model's output column, 'wordvectors'.
wembed_df2 = wembed_model.transform(wembed_df)
wembed_df2.show(n=5)

+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|         name|                text|           inputText|              tokens|   tokens_sw_removed|         wordvectors|
+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|RI33oswGDkIsc0fuQ...|Oregon Steaks|after several ppl...|after several ppl...|[after, several, ...|[several, ppl, te...|[-0.0420830252936...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|The cheeseburgers...|The cheeseburgers...|[the, cheeseburge...|[cheeseburgers, g...|[-0.0642510576075...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|In my humble opin...|In my humble opin...|[in, my, humble, ...|[humble, opinion,...|[0.00317551928261...|
|RI33oswGDkIsc0fuQ...|Oregon Steaks|Overall this plac...|Overall this plac...|[overall, this, p...|[overall, place, ...|[-0.0054386841440...|
|RI33o

In [None]:
# Save the identifying columns, the raw text and the vectors as Parquet.
# mode("overwrite") lets this cell be re-run: the writer's default mode raises
# an error when the output path already exists.
(
    wembed_df2
    .select('business_id', 'name', 'text', 'wordvectors')
    .write
    .mode("overwrite")
    .parquet("/Users/celine/Desktop/5430nlp/group/web/dataset_with_vectors.parquet")
)


23/12/08 19:53:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                