In [1]:
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (2).json


{'kaggle.json': b'{"username":"veronicaastorino","key":"d70bec0f4a36fef2eb100d1b63c71351"}'}

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [3]:
!kaggle datasets download -d stackoverflow/stacksample -p ./data

stacksample.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
!unzip ./data/stacksample.zip

Archive:  ./data/stacksample.zip
replace Answers.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace Questions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace Tags.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz 
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

!pip install -q findspark
!pip install pyspark



In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [7]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [8]:
import findspark
findspark.init("spark-3.1.1-bin-hadoop2.7")
from pyspark.sql import SparkSession

spark = SparkSession.builder\
      .master("local")\
        .appName("similaritems")\
        .config('spark.ui.port', '4050')\
        .config("spark.driver.memory", "2g") \
        .config("spark.sql.broadcastTimeout", "10h") \
        .config("spark.executor.memory", "10g") \
        .config('spark.sql.shuffle.partitions', '3') \
        .config("spark.sql.execution.arrow.enabled","true") \
        .getOrCreate()

sc = spark.sparkContext

In [9]:
df = spark.read\
  .option("multiLine", "true")\
  .option("header", "true")\
  .option("escape", "\"")\
  .csv("Questions.csv")\
  .select("Id","Body")

df.show(10)

+---+--------------------+
| Id|                Body|
+---+--------------------+
| 80|<p>I've written a...|
| 90|<p>Are there any ...|
|120|<p>Has anyone got...|
|180|<p>This is someth...|
|260|<p>I have a littl...|
|330|<p>I am working o...|
|470|<p>I've been writ...|
|580|<p>I wonder how y...|
|650|<p>I would like t...|
|810|<p>I'm trying to ...|
+---+--------------------+
only showing top 10 rows



In [10]:
# Check for missing values
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+----+
| Id|Body|
+---+----+
|  0|   0|
+---+----+



In [11]:
df.count()

1264216

In [12]:
# Clean Text
from pyspark.sql.functions import col, lower, regexp_replace, split

def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "^rt ", "")
  c = regexp_replace(c, "<p>", "")
  c = regexp_replace(c, "</p>", "")
  c = regexp_replace(c, "(https?\://)\S+", "")
  c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
  return c

clean_text_df = df.select('Id',clean_text(col("Body")).alias("Body"))

In [13]:
clean_text_df.show()

+----+--------------------+
|  Id|                Body|
+----+--------------------+
|  80|ive written a dat...|
|  90|are there any rea...|
| 120|has anyone got ex...|
| 180|this is something...|
| 260|i have a little g...|
| 330|i am working on a...|
| 470|ive been writing ...|
| 580|i wonder how you ...|
| 650|i would like the ...|
| 810|im trying to main...|
| 930|whats the simples...|
|1010|i need to grab th...|
|1040|im looking for a ...|
|1070|what is the corre...|
|1160|i am using ccnet ...|
|1180|i am looking to a...|
|1300|i am using msbuil...|
|1390|im setting up a d...|
|1600|i always create a...|
|1610|if im adding a co...|
+----+--------------------+
only showing top 20 rows



In [14]:
# Tokenizer text
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Body", outputCol="vector")
vector_df = tokenizer.transform(clean_text_df)

vector_df.printSchema()
vector_df.show(10)

root
 |-- Id: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+--------------------+--------------------+
| Id|                Body|              vector|
+---+--------------------+--------------------+
| 80|ive written a dat...|[ive, written, a,...|
| 90|are there any rea...|[are, there, any,...|
|120|has anyone got ex...|[has, anyone, got...|
|180|this is something...|[this, is, someth...|
|260|i have a little g...|[i, have, a, litt...|
|330|i am working on a...|[i, am, working, ...|
|470|ive been writing ...|[ive, been, writi...|
|580|i wonder how you ...|[i, wonder, how, ...|
|650|i would like the ...|[i, would, like, ...|
|810|im trying to main...|[im, trying, to, ...|
+---+--------------------+--------------------+
only showing top 10 rows



In [15]:
# Removing stopwords
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover()
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [16]:
# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("vector_no_stopw")

# Transform existing dataframe with the StopWordsRemover
vector_no_stopw_df = remover.transform(vector_df)

# Display
vector_no_stopw_df.printSchema()
vector_no_stopw_df.show()

root
 |-- Id: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector_no_stopw: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|
+----+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|
| 470|ive been writing ...|[ive, been, writi...|[ive, writing, we...|
| 580|i wonder how you ...|[i, wonder, how, .

In [17]:
# TF-IDF with HashingTF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="vector_no_stopw", outputCol="rawFeatures", numFeatures=100000)

featurized_data = hashingTF.transform(vector_no_stopw_df)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurized_data)
TfIdfData = idfModel.transform(featurized_data)
TfIdfData.show()

+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(100000,[585,3835...|(100000,[585,3835...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|(100000,[2543,162...|(100000,[2543,162...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(100000,[2143,399...|(100000,[2143,399...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|(100000,[6056,606...|(100000,[6056,606...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|(100000,[585,2144...|(100000,[585,2144...|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|(100000,[585,2869...|(100000,[585,2

In [18]:
# MinHashLSH
from pyspark.ml.feature import MinHashLSH
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=1000)
LSHmodel = mh.fit(TfIdfData)

LSH = LSHmodel.transform(TfIdfData)
LSH.show()

+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|              hashes|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(100000,[585,3835...|(100000,[585,3835...|[[1.8924956E7], [...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|(100000,[2543,162...|(100000,[2543,162...|[[7.5910755E7], [...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(100000,[2143,399...|(100000,[2143,399...|[[2.2088109E7], [...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|(100000,[6056,606...|(100000,[6056,606...|[[7.6857533E7], [...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|

In [19]:
from pyspark.sql.functions import col
TfIdfData.select('Body').take(1)[0][0]

'ive written a database generation script in a href and want to execute it in my a href aira application\n\nprecodecreate table trole \n      roleid integer primary key\n      rolename varchar40\n\ncreate table tfile \n    fileid integer primary key\n    filename varchar50\n    filedescription varchar500\n    thumbnailid integer\n    fileformatid integer\n    categoryid integer\n    isfavorite boolean\n    dateadded date\n    globalaccesscount integer\n    lastaccesstime date\n    downloadcomplete boolean\n    isnew boolean\n    isspotlight boolean\n    duration varchar30\n\ncreate table tcategory \n    categoryid integer primary key\n    categoryname varchar50\n    parentcategoryid integer\n\n\ncodepre\n\ni execute this in adobe air using the following methods\n\nprecodepublic static function runsqlfromfilefilenamestringvoid \n    var filefile  fileapplicationdirectoryresolvepathfilename\n    var streamfilestream  new filestream\n    streamopenfile filemoderead\n    var strsqlstring  

In [20]:
key = TfIdfData.select('features').take(1)
key

[Row(features=SparseVector(100000, {585: 4.318, 3835: 8.089, 4967: 2.9923, 5563: 8.9624, 5629: 8.9686, 9348: 1.8802, 10268: 8.9142, 10960: 1.1156, 11398: 8.938, 11404: 4.404, 11834: 3.6739, 12693: 12.7285, 14128: 7.9319, 15108: 8.932, 15223: 8.8242, 15662: 7.6692, 17712: 1.3185, 19012: 9.1521, 19812: 3.9605, 20269: 7.5293, 20769: 8.4081, 21770: 2.2645, 24833: 5.6486, 26313: 3.7736, 26523: 4.5205, 27531: 8.5735, 27949: 3.5499, 29216: 4.6605, 32535: 4.6344, 32857: 7.9209, 32858: 2.7265, 33420: 13.8781, 34488: 8.9025, 34792: 3.6956, 35146: 2.5943, 36712: 6.82, 36963: 6.0981, 37126: 32.7503, 40088: 6.7007, 40959: 9.1521, 43931: 7.8268, 46536: 3.5043, 48201: 4.145, 48382: 2.3605, 49453: 3.0772, 51174: 3.8357, 52887: 15.7319, 53131: 9.7581, 54617: 8.1949, 54693: 2.2981, 55039: 6.017, 56723: 12.232, 59427: 9.0195, 59936: 8.2661, 62891: 6.7488, 63673: 7.6991, 65432: 3.2491, 65528: 9.046, 66133: 13.6519, 66537: 8.2569, 67140: 9.1902, 68012: 7.202, 68619: 2.7382, 69129: 0.7928, 70815: 1.6764, 70

In [21]:
# Perform approximate nearest neighbor search
similar=LSHmodel.approxNearestNeighbors(TfIdfData, key[0][0], 10)
similar.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|      Id|                Body|              vector|     vector_no_stopw|         rawFeatures|            features|              hashes|           distCol|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|      80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(100000,[585,3835...|(100000,[585,3835...|[[1.8924956E7], [...|               0.0|
| 7882580|i am facing probl...|[i, am, facing, p...|[facing, problem,...|(100000,[585,1666...|(100000,[585,1666...|[[1.8924956E7], [...|0.8707482993197279|
| 6270600|i am writing some...|[i, am, writing, ...|[writing, code, p...|(100000,[495,585,...|(100000,[495,585,...|[[2.2853516E7], [...|0.8733333333333333|
|28078370|im using flask sq...|[im, using, flask...|[im, using, 

In [22]:
similar.select('Body').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------