<a href="https://colab.research.google.com/github/vaigai138/PySpark-Task1/blob/main/Preprocessing_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName("Nlp").getOrCreate()

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


In [None]:
sent_df = spark.read.csv("/content/questions.csv")

In [None]:
sent_df.show(5)

+---+----+----+--------------------+--------------------+------------+
|_c0| _c1| _c2|                 _c3|                 _c4|         _c5|
+---+----+----+--------------------+--------------------+------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|
|  0|   1|   2|What is the step ...|What is the step ...|           0|
|  1|   3|   4|What is the story...|What would happen...|           0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|
+---+----+----+--------------------+--------------------+------------+
only showing top 5 rows



In [None]:
tokenizer = Tokenizer(inputCol="_c3", outputCol="words")

regextokenizer = RegexTokenizer(inputCol="_c3", outputCol="words", pattern="\\W")

count_tokens = udf(lambda words: len(words), IntegerType())

In [None]:
tokenized = tokenizer.transform(sent_df)

In [None]:
tokenized.show(5)

+---+----+----+--------------------+--------------------+------------+--------------------+
|_c0| _c1| _c2|                 _c3|                 _c4|         _c5|               words|
+---+----+----+--------------------+--------------------+------------+--------------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|         [question1]|
|  0|   1|   2|What is the step ...|What is the step ...|           0|[what, is, the, s...|
|  1|   3|   4|What is the story...|What would happen...|           0|[what, is, the, s...|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|[how, can, i, inc...|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|[why, am, i, ment...|
+---+----+----+--------------------+--------------------+------------+--------------------+
only showing top 5 rows



In [None]:
tokenized.select("_c3","words").withColumn("tokens",count_tokens(col("words"))).show(5)

+--------------------+--------------------+------+
|                 _c3|               words|tokens|
+--------------------+--------------------+------+
|           question1|         [question1]|     1|
|What is the step ...|[what, is, the, s...|    14|
|What is the story...|[what, is, the, s...|     8|
|How can I increas...|[how, can, i, inc...|    14|
|Why am I mentally...|[why, am, i, ment...|    11|
+--------------------+--------------------+------+
only showing top 5 rows



In [None]:
regextokenizered = regextokenizer.transform(sent_df)
regextokenizered.select("_c3","words").withColumn("tokens",count_tokens(col("words"))).show(5)

+--------------------+--------------------+------+
|                 _c3|               words|tokens|
+--------------------+--------------------+------+
|           question1|         [question1]|     1|
|What is the step ...|[what, is, the, s...|    14|
|What is the story...|[what, is, the, s...|    10|
|How can I increas...|[how, can, i, inc...|    14|
|Why am I mentally...|[why, am, i, ment...|    11|
+--------------------+--------------------+------+
only showing top 5 rows



In [4]:
from IPython import get_ipython
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover, IDF, HashingTF

spark = SparkSession.builder.appName("Nlp").getOrCreate()

sent_df = spark.read.csv("/content/questions.csv")
sent_df.show(5)


tokenizer = Tokenizer(inputCol="_c3", outputCol="words")
regextokenizer = RegexTokenizer(inputCol="_c3", outputCol="words", pattern="\\W")
count_tokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sent_df)
tokenized.show(5)
tokenized.select("_c3","words").withColumn("tokens",count_tokens(col("words"))).show(5)

regextokenizered = regextokenizer.transform(sent_df)
regextokenizered.select("_c3","words").withColumn("tokens",count_tokens(col("words"))).show(5)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
removed = remover.transform(regextokenizered)
removed.show(5)

hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(removed)
featurizedData.show(5)

# # Apply IDF
# idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)
# rescaledData.select("words", "features").show(5)

+---+----+----+--------------------+--------------------+------------+
|_c0| _c1| _c2|                 _c3|                 _c4|         _c5|
+---+----+----+--------------------+--------------------+------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|
|  0|   1|   2|What is the step ...|What is the step ...|           0|
|  1|   3|   4|What is the story...|What would happen...|           0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|
+---+----+----+--------------------+--------------------+------------+
only showing top 5 rows

+---+----+----+--------------------+--------------------+------------+--------------------+
|_c0| _c1| _c2|                 _c3|                 _c4|         _c5|               words|
+---+----+----+--------------------+--------------------+------------+--------------------+
| id|qid1|qid2|           question1|           question2|is_