In [1]:
#Upload your kaggle API json file:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"vamsipvs","key":"11b9c77decceaea998a95ec44402006f"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d stackoverflow/stacksample


Downloading stacksample.zip to /content
100% 1.11G/1.11G [00:09<00:00, 151MB/s]



In [4]:
!unzip stacksample.zip 

Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz 
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

In [6]:
## importing libraries
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"
import findspark
findspark.init("spark-3.1.2-bin-hadoop3.2")# SPARK_HOME
from pyspark.sql import SparkSession
session = SparkSession.builder.master("local[*]").getOrCreate()

In [7]:
questions = session.read.option("header","true").csv('Questions.csv').select('Body')


In [8]:
questions.show()

+--------------------+
|                Body|
+--------------------+
|"<p>I've written ...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [9]:
questions.count()

33149724

In [10]:
data = questions.na.drop()

In [11]:
data.count()

1404281

In [12]:
data.show()

+--------------------+
|                Body|
+--------------------+
|"<p>I've written ...|
|"<p>Are there any...|
|<p>Has anyone got...|
|<p>This is someth...|
|<p>I have a littl...|
|<p>I am working o...|
|<p>I've been writ...|
|<p>I wonder how y...|
|<p>I would like t...|
|<p>I'm trying to ...|
| and haven't seen...|
|<p>What's the sim...|
|<p>I need to grab...|
|<p>I'm looking fo...|
|<p>What is the co...|
|<p>I am using CCN...|
|<p>I am looking t...|
|<p>I am using MSB...|
|<p>I'm setting up...|
|<p>I always creat...|
+--------------------+
only showing top 20 rows



In [13]:
#regular expression removal
from pyspark.sql.functions import *

def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "<p>", "")
  c = regexp_replace(c, "</p>", "")
  c = regexp_replace(c, "<code>", "")
  c = regexp_replace(c, "</code>", "")
  c = regexp_replace(c, "'ve", "")
  c = regexp_replace(c, "'m¢", "")
  c = regexp_replace(c, "<[^>]+>", "")
  c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
  c = regexp_replace(c, "(https?\://)\S+", "")
  return c

cleaned_data = data.select(clean_text(col("Body")).alias("Body"))

In [14]:
cleaned_data.show()

+--------------------+
|                Body|
+--------------------+
|i written a datab...|
|are there any rea...|
|has anyone got ex...|
|this is something...|
|i have a little g...|
|i am working on a...|
|i been writing a ...|
|i wonder how you ...|
|i would like the ...|
|im trying to main...|
| and havent seen ...|
|whats the simples...|
|i need to grab th...|
|im looking for a ...|
|what is the corre...|
|i am using ccnet ...|
|i am looking to a...|
|i am using msbuil...|
|im setting up a d...|
|i always create a...|
+--------------------+
only showing top 20 rows



In [15]:
#feature transformation
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf    
from pyspark.sql.types import IntegerType
tokenizer = Tokenizer(inputCol="Body", outputCol="Words")
tokenized = tokenizer.transform(cleaned_data)
no_tokens = udf(lambda words: len(words), IntegerType())
tokenized.select("Body", "Words").withColumn("Number_of_tokens", no_tokens(col("Words"))).show()


+--------------------+--------------------+----------------+
|                Body|               Words|Number_of_tokens|
+--------------------+--------------------+----------------+
|i written a datab...|[i, written, a, d...|              18|
|are there any rea...|[are, there, any,...|              13|
|has anyone got ex...|[has, anyone, got...|               9|
|this is something...|[this, is, someth...|              41|
|i have a little g...|[i, have, a, litt...|              15|
|i am working on a...|[i, am, working, ...|              46|
|i been writing a ...|[i, been, writing...|              46|
|i wonder how you ...|[i, wonder, how, ...|              18|
|i would like the ...|[i, would, like, ...|              47|
|im trying to main...|[im, trying, to, ...|              66|
| and havent seen ...|[, and, havent, s...|              21|
|whats the simples...|[whats, the, simp...|              17|
|i need to grab th...|[i, need, to, gra...|              26|
|im looking for a ...|[i

In [16]:
tokenized.show()

+--------------------+--------------------+
|                Body|               Words|
+--------------------+--------------------+
|i written a datab...|[i, written, a, d...|
|are there any rea...|[are, there, any,...|
|has anyone got ex...|[has, anyone, got...|
|this is something...|[this, is, someth...|
|i have a little g...|[i, have, a, litt...|
|i am working on a...|[i, am, working, ...|
|i been writing a ...|[i, been, writing...|
|i wonder how you ...|[i, wonder, how, ...|
|i would like the ...|[i, would, like, ...|
|im trying to main...|[im, trying, to, ...|
| and havent seen ...|[, and, havent, s...|
|whats the simples...|[whats, the, simp...|
|i need to grab th...|[i, need, to, gra...|
|im looking for a ...|[im, looking, for...|
|what is the corre...|[what, is, the, c...|
|i am using ccnet ...|[i, am, using, cc...|
|i am looking to a...|[i, am, looking, ...|
|i am using msbuil...|[i, am, using, ms...|
|im setting up a d...|[im, setting, up,...|
|i always create a...|[i, always

In [17]:
tokenized.dtypes

[('Body', 'string'), ('Words', 'array<string>')]

In [18]:
from pyspark.ml.feature import StopWordsRemover
default_stopwords = StopWordsRemover()
def_stopwords = default_stopwords.getStopWords() 
def_stopwords[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers']

In [19]:
remover = StopWordsRemover(inputCol="Words", outputCol="Filtered")
Removed_data = remover.transform(tokenized)

In [20]:
removed_data2 = Removed_data.filter(size(col("Filtered")) >=1 )

In [21]:
removed_data2.show()

+--------------------+--------------------+--------------------+
|                Body|               Words|            Filtered|
+--------------------+--------------------+--------------------+
|i written a datab...|[i, written, a, d...|[written, databas...|
|are there any rea...|[are, there, any,...|[really, good, tu...|
|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|
|this is something...|[this, is, someth...|[something, pseud...|
|i have a little g...|[i, have, a, litt...|[little, game, wr...|
|i am working on a...|[i, am, working, ...|[working, collect...|
|i been writing a ...|[i, been, writing...|[writing, web, se...|
|i wonder how you ...|[i, wonder, how, ...|[wonder, guys, ma...|
|i would like the ...|[i, would, like, ...|[like, version, p...|
|im trying to main...|[im, trying, to, ...|[im, trying, main...|
| and havent seen ...|[, and, havent, s...|[, havent, seen, ...|
|whats the simples...|[whats, the, simp...|[whats, simplest,...|
|i need to grab th...|[i,

In [22]:
removed_data2.dtypes

[('Body', 'string'), ('Words', 'array<string>'), ('Filtered', 'array<string>')]

In [23]:
#feature extraction
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="Filtered", outputCol="RawFeatures", numFeatures = 16)
featurizedData = hashingTF.transform(removed_data2)

idf = IDF(inputCol="RawFeatures", outputCol="Features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+
|                Body|               Words|            Filtered|         RawFeatures|            Features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|i written a datab...|[i, written, a, d...|[written, databas...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|
|are there any rea...|[are, there, any,...|[really, good, tu...|(16,[0,1,7,8,10,1...|(16,[0,1,7,8,10,1...|
|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(16,[3,5,6,7,10,1...|(16,[3,5,6,7,10,1...|
|this is something...|[this, is, someth...|[something, pseud...|(16,[1,2,4,6,8,9,...|(16,[1,2,4,6,8,9,...|
|i have a little g...|[i, have, a, litt...|[little, game, wr...|(16,[6,7,8,10,11,...|(16,[6,7,8,10,11,...|
|i am working on a...|[i, am, working, ...|[working, collect...|(16,[1,2,3,4,5,7,...|(16,[1,2,3,4,5,7,...|
|i been writing a ...|[i, been, writi

In [24]:
rescaledData.dtypes

[('Body', 'string'),
 ('Words', 'array<string>'),
 ('Filtered', 'array<string>'),
 ('RawFeatures', 'vector'),
 ('Features', 'vector')]

In [25]:
#locality sensitive hashing
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

mh = MinHashLSH(inputCol="Features", outputCol="Hashes", numHashTables = 10)
model = mh.fit(rescaledData)
Main = model.transform(rescaledData)

In [26]:
Main.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                Body|               Words|            Filtered|         RawFeatures|            Features|              Hashes|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|i written a datab...|[i, written, a, d...|[written, databas...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|[[3.22603552E8], ...|
|are there any rea...|[are, there, any,...|[really, good, tu...|(16,[0,1,7,8,10,1...|(16,[0,1,7,8,10,1...|[[1.52291878E8], ...|
|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(16,[3,5,6,7,10,1...|(16,[3,5,6,7,10,1...|[[1.52291878E8], ...|
|this is something...|[this, is, someth...|[something, pseud...|(16,[1,2,4,6,8,9,...|(16,[1,2,4,6,8,9,...|[[3.22603552E8], ...|
|i have a little g...|[i, have, a, litt...|[little, game, wr...|(16,[6,7,8,10,11,...|(16,[6,7,8,10,11,..

In [27]:
Main.dtypes

[('Body', 'string'),
 ('Words', 'array<string>'),
 ('Filtered', 'array<string>'),
 ('RawFeatures', 'vector'),
 ('Features', 'vector'),
 ('Hashes', 'array<vector>')]

In [28]:
key = rescaledData.select('Features').take(1)
key

[Row(Features=SparseVector(16, {0: 0.7131, 3: 0.7198, 4: 1.2929, 6: 0.8598, 8: 0.7486, 10: 0.7511, 13: 0.7707, 14: 1.739}))]

In [29]:
mainkey = Vectors.sparse(16, {0: 0.7192, 3: 0.728, 4: 1.3066, 6: 0.8661, 8: 0.7596, 10: 0.756, 13: 0.7768, 14: 1.7499})

In [31]:
model.approxNearestNeighbors(rescaledData, mainkey, 10).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|                Body|               Words|            Filtered|         RawFeatures|            Features|              Hashes|distCol|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|on selection chan...|[on, selection, c...|[selection, chang...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|[[3.22603552E8], ...|    0.0|
|good day guys can...|[good, day, guys,...|[good, day, guys,...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|[[3.22603552E8], ...|    0.0|
|i want to know wh...|[i, want, to, kno...|[want, know, noti...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|[[3.22603552E8], ...|    0.0|
|i am trying to fi...|[i, am, trying, t...|[trying, find, so...|(16,[0,3,4,6,8,10...|(16,[0,3,4,6,8,10...|[[3.22603552E8], ...|    0.0|
|i am trying to ge...|[i, am, trying, t...|[tryi