In [1]:
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"veronicaastorino","key":"d70bec0f4a36fef2eb100d1b63c71351"}'}

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d stackoverflow/stacksample -p ./data

Downloading stacksample.zip to ./data
 99% 1.09G/1.11G [00:13<00:00, 128MB/s]
100% 1.11G/1.11G [00:13<00:00, 85.8MB/s]


In [4]:
!unzip ./data/stacksample.zip

Archive:  ./data/stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz 
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

!pip install -q findspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 61kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 10.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=e9648d138a151e835e8086b0a77b4a05a61c97071d2fa69b7b5b334873212f52
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [7]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [8]:
import findspark
findspark.init("spark-3.1.1-bin-hadoop2.7")
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

sc = spark.sparkContext

In [9]:
df = spark.read\
  .option("multiLine", "true")\
  .option("header", "true")\
  .option("escape", "\"")\
  .csv("Questions.csv")\
  .select("Id","Body")

df.show(10)

+---+--------------------+
| Id|                Body|
+---+--------------------+
| 80|<p>I've written a...|
| 90|<p>Are there any ...|
|120|<p>Has anyone got...|
|180|<p>This is someth...|
|260|<p>I have a littl...|
|330|<p>I am working o...|
|470|<p>I've been writ...|
|580|<p>I wonder how y...|
|650|<p>I would like t...|
|810|<p>I'm trying to ...|
+---+--------------------+
only showing top 10 rows



In [10]:
# Check for missing values
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+----+
| Id|Body|
+---+----+
|  0|   0|
+---+----+



In [11]:
df.count()

1264216

In [12]:
# Clean Text
from pyspark.sql.functions import col, lower, regexp_replace, split

def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "^rt ", "")
  c = regexp_replace(c, "<p>", "")
  c = regexp_replace(c, "</p>", "")
  c = regexp_replace(c, "(https?\://)\S+", "")
  c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
  return c

clean_text_df = df.select('Id',clean_text(col("Body")).alias("Body"))

In [13]:
clean_text_df.show()

+----+--------------------+
|  Id|                Body|
+----+--------------------+
|  80|ive written a dat...|
|  90|are there any rea...|
| 120|has anyone got ex...|
| 180|this is something...|
| 260|i have a little g...|
| 330|i am working on a...|
| 470|ive been writing ...|
| 580|i wonder how you ...|
| 650|i would like the ...|
| 810|im trying to main...|
| 930|whats the simples...|
|1010|i need to grab th...|
|1040|im looking for a ...|
|1070|what is the corre...|
|1160|i am using ccnet ...|
|1180|i am looking to a...|
|1300|i am using msbuil...|
|1390|im setting up a d...|
|1600|i always create a...|
|1610|if im adding a co...|
+----+--------------------+
only showing top 20 rows



In [14]:
# Tokenizer text
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Body", outputCol="vector")
vector_df = tokenizer.transform(clean_text_df)

vector_df.printSchema()
vector_df.show(10)

root
 |-- Id: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+--------------------+--------------------+
| Id|                Body|              vector|
+---+--------------------+--------------------+
| 80|ive written a dat...|[ive, written, a,...|
| 90|are there any rea...|[are, there, any,...|
|120|has anyone got ex...|[has, anyone, got...|
|180|this is something...|[this, is, someth...|
|260|i have a little g...|[i, have, a, litt...|
|330|i am working on a...|[i, am, working, ...|
|470|ive been writing ...|[ive, been, writi...|
|580|i wonder how you ...|[i, wonder, how, ...|
|650|i would like the ...|[i, would, like, ...|
|810|im trying to main...|[im, trying, to, ...|
+---+--------------------+--------------------+
only showing top 10 rows



In [15]:
# Removing stopwords
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover()
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [16]:
# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("vector_no_stopw")

# Transform existing dataframe with the StopWordsRemover
vector_no_stopw_df = remover.transform(vector_df)

# Display
vector_no_stopw_df.printSchema()
vector_no_stopw_df.show()

root
 |-- Id: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector_no_stopw: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|
+----+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|
| 470|ive been writing ...|[ive, been, writi...|[ive, writing, we...|
| 580|i wonder how you ...|[i, wonder, how, .

In [17]:
# TF-IDF with HashingTF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="vector_no_stopw", outputCol="rawFeatures", numFeatures=100000)

featurized_data = hashingTF.transform(vector_no_stopw_df)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurized_data)
TfIdfData = idfModel.transform(featurized_data)
TfIdfData.show()

+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(100000,[585,3835...|(100000,[585,3835...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|(100000,[2543,162...|(100000,[2543,162...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(100000,[2143,399...|(100000,[2143,399...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|(100000,[6056,606...|(100000,[6056,606...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|(100000,[585,2144...|(100000,[585,2144...|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|(100000,[585,2869...|(100000,[585,2

In [18]:
# MinHashLSH
from pyspark.ml.feature import MinHashLSH
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=1000)
LSHmodel = mh.fit(TfIdfData)

LSH = LSHmodel.transform(TfIdfData)
LSH.show()

+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|              hashes|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(100000,[585,3835...|(100000,[585,3835...|[[2.0321607E7], [...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|(100000,[2543,162...|(100000,[2543,162...|[[7098350.0], [19...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(100000,[2143,399...|(100000,[2143,399...|[[3209259.0], [8....|
| 180|this is something...|[this, is, someth...|[something, ive, ...|(100000,[6056,606...|(100000,[6056,606...|[[1.5685882E8], [...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|

In [19]:
#sparse = TfIdfData.select('features').take(1)
#sparse

In [None]:
# Creation of a new DataFrame with some keywords
df = spark.createDataFrame([
        (1, "index,memory,hashing,creation")],
    ['Id',"vector"] 
)
df2 = df.select(split(col("vector"),",").alias("vector"))
df2=remover.transform(df2)
df2=hashingTF.transform(df2)
df2=idfModel.transform(df2)
df2.show()

+--------------------+--------------------+--------------------+--------------------+
|              vector|     vector_no_stopw|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+
|[index, memory, h...|[index, memory, h...|(100000,[8089,269...|(100000,[8089,269...|
+--------------------+--------------------+--------------------+--------------------+



In [None]:
from pyspark.ml.linalg import Vectors
key=df2.take(1)[0][3]
key

SparseVector(100000, {8089: 7.4062, 26941: 3.9947, 29140: 5.296, 55664: 3.6054})

In [None]:
# Perform approximate nearest neighbor search
similar=LSHmodel.approxNearestNeighbors(TfIdfData, key, 10)
similar.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|      Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|              hashes|           distCol|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|16560530|how can i change ...|[how, can, i, cha...|[change, index, c...|(100000,[7228,107...|(100000,[7228,107...|[[3.00915426E8], ...|0.7272727272727273|
| 4516710|how is memory acc...|[how, is, memory,...|[memory, accessed...|(100000,[26941,43...|(100000,[26941,43...|[[9.91079297E8], ...|0.8333333333333334|
|13529330|what is uniformly...|[what, is, unifor...|[uniformly, addre...|(100000,[1707,269...|(100000,[1707,269...|[[7.31298015E8], ...|0.8333333333333334|
| 6869880|is it just the fi...|[is, it, just, th...|[file, conte

In [None]:
similar.select('Body').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|Body                                                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|how can i change index creation memory using ssms 2008r2 

what is the best practice for index creation memory
                                       |
|how is memory accessed in javascript
                                                                                                                 |
|what is uniformly addressable memory 
                                                                                                                |
|is it just the file contents that get hashed is there any way to include the file