<a href="https://colab.research.google.com/github/zbutton314/CS-5560/blob/main/Lab5/code/ICP_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 69.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=2942a526706130fc3970f81620af820a792757ca3eecbe10bafa2701ee10703b
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [108]:
import sys
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec
import pandas as pd
import re
from google.colab import drive

In [63]:
import nltk

# Download the NLTK data packages before the tokenizer and lemmatizer are used.
for nltk_resource in ('punkt', 'wordnet'):
  nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ICP-5")
    .getOrCreate()
)

In [6]:
# Folder (on the mounted Google Drive) that holds the example text files.
icp5_data_path = "/content/drive/MyDrive/data/ICP-5"

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


# Import Data

### Helper that reads a text file and cleans it (replaces non-breaking spaces, removes bracketed citation numbers such as [12])

In [117]:
def import_text(path, encoding="utf-8"):
  """Read a text file and return its cleaned contents.

  Cleaning steps, in order: trailing whitespace is stripped, non-breaking
  spaces (U+00A0) are replaced by regular spaces, and bracketed numeric
  markers such as "[12]" (or an empty "[]") are removed.

  Args:
    path: Location of the text file to read.
    encoding: Codec used to decode the file. Defaults to UTF-8 so the result
      does not depend on the platform's locale setting.

  Returns:
    The cleaned text as a single string.
  """
  with open(path, 'r', encoding=encoding) as file:
    s = file.read().rstrip().replace(u'\xa0', u' ')

  # Drop bracketed numbers, e.g. "[3]".
  s = re.sub(r"\[[0-9]*\]", "", s)

  return s


### Import each of the 5 example text files

In [118]:
def _load_document(file_name):
  """Import one file from `icp5_data_path` and print its size.

  Args:
    file_name: Name of the text file inside `icp5_data_path`.

  Returns:
    A `(path, text)` tuple: the full path that was read and the text
    returned by `import_text` for it.
  """
  path = f"{icp5_data_path}/{file_name}"
  text = import_text(path)
  print(f"{path.split('/')[-1]} imported: {len(text)} characters")
  return path, text


# Apache Spark
path_apache, str_apache = _load_document("Apache_Spark.txt")

# Spark (Fire)
path_fire, str_fire = _load_document("Spark.txt")

# Chevrolet Spark
path_chevy, str_chevy = _load_document("Chevrolet_Spark.txt")

# Ayla Ranzz (aka Spark)
path_ayla, str_ayla = _load_document("Ayla_Ranzz.txt")

# Jordin Sparks
path_jordin, str_jordin = _load_document("Jordin_Sparks.txt")

Apache_Spark.txt imported: 9267 characters
Spark.txt imported: 6659 characters
Chevrolet_Spark.txt imported: 26672 characters
Ayla_Ranzz.txt imported: 10754 characters
Jordin_Sparks.txt imported: 35819 characters


# TF-IDF

### Function for finding top TF-IDF words

In [96]:
def find_top_tf_idf(spark_df, n, input_col, doc_names=None):
  """Return the `n` highest-scoring TF-IDF terms of every document.

  Args:
    spark_df: Spark DataFrame with one row per document; `input_col` holds
      the terms of that document.
    n: Number of top terms to keep per document.
    input_col: Name of the column that is fed to the CountVectorizer.
    doc_names: Optional document names, matched to rows by position.
      Defaults to the notebook's five documents
      (apache, fire, chevy, ayla, jordin).

  Returns:
    Dict mapping each document name to an array of its `n` top terms,
    ordered from highest to lowest TF-IDF value.
  """
  if doc_names is None:
    doc_names = ["apache", "fire", "chevy", "ayla", "jordin"]

  # Find term frequencies
  vectorizer = CountVectorizer(inputCol=input_col, outputCol="rawFeatures").fit(spark_df)
  featurizedData = vectorizer.transform(spark_df)

  # Rescale data (IDF)
  idfModel = IDF(inputCol="rawFeatures", outputCol="features").fit(featurizedData)
  rescaledData = idfModel.transform(featurizedData)

  # Collect the feature vectors once; calling collect() per document would
  # launch a separate Spark job each time for the same result.
  # NOTE: rows are matched to doc_names by their position in collect() order.
  feature_rows = rescaledData.select("features").collect()

  # Create data frame for results: one row per vocabulary word,
  # one TF-IDF column per document
  df = pd.DataFrame({"word": vectorizer.vocabulary})
  for i, name in enumerate(doc_names):
    df[f"features_{name}"] = feature_rows[i][0].toArray()

  # Pick the top n TF-IDF words for each document; the key is the document
  # name itself, so names containing "_" are handled correctly.
  d = {}
  for name in doc_names:
    col = f"features_{name}"
    d[name] = df.sort_values(col, ascending=False).head(n)["word"].values

  return d

### Find top words from raw input

In [120]:
# One (label, text) row per document
rawData = spark.createDataFrame(
    list(zip(
        [0.0, 0.1, 0.2, 0.3, 0.4],
        [str_apache, str_fire, str_chevy, str_ayla, str_jordin],
    )),
    schema=["label", "text"],
)

# Tokenize raw text data
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(rawData)

# Top 10 TF-IDF words per document, computed on the raw tokens
results_raw = find_top_tf_idf(spark_df=wordsData, n=10, input_col="words")

for doc in results_raw:
  print(f"{doc}: {results_raw[doc]}")

apache: ['apache' 'data' 'rdd' 'spark' 'streaming' 'rdds' 'interface' 'cluster'
 'graphx' 'algorithms']
fire: ['sparks' 'metal' 'sparks.' 'steel' 'especially' 'fire' 'vapor' 'low'
 'burns' 'temperature']
chevy: ['chevrolet' 'matiz' 'car' 'gm' 'daewoo' 'beat' 'marketed' 'south' 'sold'
 'model']
ayla: ['her' 'she' 'ayla' 'legion' 'lightning' 'powers' 'lass' 'brother' 'garth'
 'ranzz']
jordin: ['sparks' 'her' 'music' 'she' "sparks's" 'song' 'album' 'number'
 'american' 'performed']


### Find top words from lemmatized input

In [119]:
lemmatizer = WordNetLemmatizer()

# Split every document into NLTK word tokens
words_apache, words_fire, words_chevy, words_ayla, words_jordin = [
    word_tokenize(text)
    for text in (str_apache, str_fire, str_chevy, str_ayla, str_jordin)
]

# Lemmatize every token of every document
lemmas_apache, lemmas_fire, lemmas_chevy, lemmas_ayla, lemmas_jordin = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in (words_apache, words_fire, words_chevy, words_ayla, words_jordin)
]

# One (label, lemmas) row per document
lemmasData = spark.createDataFrame(
    list(zip(
        [0.0, 0.1, 0.2, 0.3, 0.4],
        [lemmas_apache, lemmas_fire, lemmas_chevy, lemmas_ayla, lemmas_jordin],
    )),
    schema=["label", "lemmas"],
)

# Top 10 TF-IDF words per document, computed on the lemmas
results_lemma = find_top_tf_idf(spark_df=lemmasData, n=10, input_col="lemmas")

for doc in results_lemma:
  print(f"{doc}: {results_lemma[doc]}")

apache: ['data' 'Apache' 'RDD' 'RDDs' 'API' 'interface' 'Spark' 'support'
 'algorithm' 'GraphX']
fire: ['spark' 'metal' 'fire' 'particle' 'steel' 'welding' 'phase' 'flint'
 'burn' 'especially']
chevy: ['Chevrolet' 'car' 'Matiz' 'GM' 'Daewoo' 'India' 'sold' 'marketed' 'South'
 'trim']
ayla: ['her' 'Legion' 'Ayla' 'she' 'Lightning' 'power' 'Lass' 'brother' 'Garth'
 'She']
jordin: ['Sparks' 'her' 'album' 'song' 'On' 'Idol' 'Music' 'single' 'I' 'number']


### Find top words from n-gram input

In [121]:
# Reuse `wordsData` (the tokenized documents built in the raw-input section)
# instead of rebuilding an identical DataFrame and Tokenizer here.
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramsData = ngram.transform(wordsData)

# Top 10 TF-IDF bigrams per document
results_ngram = find_top_tf_idf(spark_df=ngramsData, n=10, input_col="ngrams")

for doc in results_ngram.keys():
  print(f"{doc}: {results_ngram[doc]}")

apache: ['apache spark' 'top of' 'the rdd' 'on top' 'the apache' 'apache software'
 'machine learning' 'spark core' 'api is' 'spark sql']
fire: ['sparks. the' 'and so' 'color of' 'the sparks' 'of sparks' 'sparks can'
 'the color' ' ' 'sparks when' 'molten metal']
chevy: ['the chevrolet' 'chevrolet spark' 'the matiz' 'the car' 'version was'
 'the daewoo' 'spark ev' 'in south' 'matiz was' 'the beat']
ayla: ['the legion' 'legion of' 'her brother' 'light lass' 'she is'
 'as lightning' 'ayla ranzz' 'ability to' 'she was' 'of super-heroes']
jordin: ['announced that' 'on may' 'the song' 'sparks was' 'that sparks'
 'on november' 'american idol' 'at number' 'the billboard' 'sparks would']


# W2V

### Find the most similar words with Word2Vec (W2V), ranked by cosine similarity

In [113]:
def find_similarities(mode, vector_size, doc_selection="all", num_synonyms=5, seed=None):
  """Print Word2Vec nearest neighbours for a document's top TF-IDF terms.

  A Word2Vec model is fitted on the notebook-level DataFrame that matches
  `mode`. For every selected document, the closest terms to each of its top
  TF-IDF terms are shown together with their similarity.

  Args:
    mode: "raw", "lemmas" or "ngrams"; selects `wordsData`/`results_raw`,
      `lemmasData`/`results_lemma` or `ngramsData`/`results_ngram`.
    vector_size: Passed to Word2Vec as `vectorSize`.
    doc_selection: "all", or a single key of the selected results dict.
    num_synonyms: Number of similar terms to show per term.
    seed: Optional Word2Vec seed for repeatable results; when None the
      estimator's own default is used.

  Raises:
    ValueError: If `mode` is not one of the supported values.
    KeyError: If `doc_selection` is neither "all" nor a known document.
  """
  if mode == "raw":
    spark_df = wordsData
    input_col = "words"
    results_dict = results_raw
  elif mode == "lemmas":
    spark_df = lemmasData
    input_col = "lemmas"
    results_dict = results_lemma
  elif mode == "ngrams":
    spark_df = ngramsData
    input_col = "ngrams"
    results_dict = results_ngram
  else:
    # Raise a regular exception; sys.exit() would raise SystemExit inside the notebook.
    raise ValueError("Invalid mode: choose raw, lemmas, or ngrams")

  if doc_selection == "all":
    selected_docs = list(results_dict.keys())
  elif doc_selection in results_dict:
    selected_docs = [doc_selection]
  else:
    # Reject an unknown document before spending time on the model fit.
    raise KeyError(
        f"Unknown document {doc_selection!r}: choose 'all' or one of {list(results_dict)}")

  w2v_params = dict(vectorSize=vector_size, minCount=0, inputCol=input_col,
                    outputCol="result")
  if seed is not None:
    w2v_params["seed"] = seed
  model = Word2Vec(**w2v_params).fit(spark_df)

  for doc in selected_docs:
    print(f"COMPUTING FOR {doc}...\n")
    for word in results_dict[doc]:
      print(f"-- Top Cosine Similarities for {word}:")
      model.findSynonyms(word, num_synonyms).show(num_synonyms)

### Top "synonyms" for apache document, using raw data



In [114]:
# Word2Vec neighbours of the apache document's top TF-IDF words (raw tokens)
find_similarities(
    mode="raw",
    vector_size=10,
    doc_selection="apache",
)

COMPUTING FOR apache...

-- Top Cosine Similarities for apache:
+--------+------------------+
|    word|        similarity|
+--------+------------------+
| arrived|0.9133138656616211|
|    days|0.8859877586364746|
|admitted|  0.84316486120224|
|     kin|0.8406805396080017|
|unveiled|0.8324633240699768|
+--------+------------------+

-- Top Cosine Similarities for data:
+-------+------------------+
|   word|        similarity|
+-------+------------------+
|  spill|0.8955745697021484|
|   kid.| 0.878497302532196|
|(2012).|0.8733638525009155|
|  songs|0.8370116353034973|
|single.|0.8304732441902161|
+-------+------------------+

-- Top Cosine Similarities for rdd:
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|     love",|0.8870320916175842|
|   america.| 0.824679970741272|
|parallelism| 0.810459554195404|
|  datasets,|0.7990445494651794|
|   alluxio,|0.7912497520446777|
+-----------+------------------+

-- Top Cosine Similarities for s



+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|manufacture|0.8856542110443115|
|   multiple|0.8686760067939758|
|       21.3|0.8685535192489624|
|    parking|0.8621446490287781|
|   website,|0.8157819509506226|
+-----------+------------------+

-- Top Cosine Similarities for algorithms:
+------------+------------------+
|        word|        similarity|
+------------+------------------+
|introduction|0.9230805039405823|
|       lad's|0.8834492564201355|
|      engine|0.8758612275123596|
|    version,| 0.870771050453186|
|       power|0.8691269159317017|
+------------+------------------+



### Top "synonyms" for apache document, using lemmatized data

In [115]:
# Word2Vec neighbours of the apache document's top TF-IDF words (lemmas)
find_similarities(
    mode="lemmas",
    vector_size=10,
    doc_selection="apache",
)

COMPUTING FOR apache...

-- Top Cosine Similarities for Apache:
+---------+------------------+
|     word|        similarity|
+---------+------------------+
|      hot|0.9629665017127991|
|     used|0.9607371091842651|
|automatic|0.9549303650856018|
|     till|0.9545678496360779|
|     than|0.9429707527160645|
+---------+------------------+

-- Top Cosine Similarities for data:
+-----+------------------+
| word|        similarity|
+-----+------------------+
|still|0.9749342799186707|
| used|0.9702635407447815|
|   us|0.9697751402854919|
|brand|  0.96111661195755|
| fire|0.9563091993331909|
+-----+------------------+

-- Top Cosine Similarities for RDD:
+--------+------------------+
|    word|        similarity|
+--------+------------------+
|Eldritch|0.9255056977272034|
|titanium|0.8345174193382263|
|performs|0.8333288431167603|
| January|0.8236171007156372|
|  record|0.7977603673934937|
+--------+------------------+

-- Top Cosine Similarities for RDDs:
+------------+-----------------



+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|      being|0.9533401131629944|
|    trouble|0.9428126215934753|
|      front|0.9362180829048157|
|environment|0.9333353638648987|
|       from|0.9205324053764343|
+-----------+------------------+

-- Top Cosine Similarities for program:
+----------+------------------+
|      word|        similarity|
+----------+------------------+
|      Wolf|0.8926135897636414|
|      teen|0.8474096059799194|
|   holiday|0.8346070647239685|
|non-profit|0.8344035148620605|
|explaining|0.8287461400032043|
+----------+------------------+

-- Top Cosine Similarities for function:
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|    17-inch|0.8593699336051941|
|alternating|0.8189995884895325|
| scheduling| 0.809699296951294|
|       Gulf|0.8060097694396973|
|       deal|0.8007938861846924|
+-----------+------------------+



### Top "synonyms" for apache document, using n-gram data

In [116]:
# Word2Vec neighbours of the apache document's top TF-IDF bigrams
find_similarities(
    mode="ngrams",
    vector_size=10,
    doc_selection="apache",
)

COMPUTING FOR apache...

-- Top Cosine Similarities for apache spark:
+-----------------+------------------+
|             word|        similarity|
+-----------------+------------------+
|sun-eater. later,| 0.939329206943512|
|       her native|0.9139214158058167|
|     episode. the|0.9130833148956299|
|    broadway. she|0.9070369601249695|
|received numerous| 0.904209554195404|
+-----------------+------------------+

-- Top Cosine Similarities for on top:
+--------------------+------------------+
|                word|        similarity|
+--------------------+------------------+
|            began at|0.9341737627983093|
|     the opportunity| 0.922924816608429|
|outlet.[citation ...|0.8885645866394043|
|           film left|0.8702558279037476|
|              a city|0.8697482943534851|
+--------------------+------------------+

-- Top Cosine Similarities for the rdd:
+--------------------+------------------+
|                word|        similarity|
+--------------------+--------------



+-------------------+------------------+
|               word|        similarity|
+-------------------+------------------+
|super-heroes. there|0.9346035718917847|
|      holden barina|0.8798717260360718|
|      as pagerank):| 0.862449049949646|
|           new song|0.8458932638168335|
|         "five year|0.8370183110237122|
+-------------------+------------------+

-- Top Cosine Similarities for apache software:
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|  spears on|0.9276765584945679|
|m3x concept|0.9235324859619141|
|     an 800|0.9149952530860901|
| lighter or|0.8879521489143372|
|  powers as|0.8864573836326599|
+-----------+------------------+

-- Top Cosine Similarities for top of:
+---------------+------------------+
|           word|        similarity|
+---------------+------------------+
|      gm halted|0.9515533447265625|
| episode, "keep| 0.945679783821106|
|   bring myself|0.9402528405189514|
|   storyline it|0.9325