In [74]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

In [75]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Colab Notebooks/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks


In [76]:
data = spark.read.csv("/content/drive/My Drive/ML2textdata.csv",inferSchema=True, header =True)

In [77]:
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)



In [81]:
data.show()

+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|                  id|             host_id|                name|         description|neighborhood_overview|          host_about|   host_is_superhost|
+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|                8357|               24281|The Mushroom Dome...|Next available da...| Even though we li...|We live on 10 bea...|                null|
|I have been hosti...| 2009 and really ...|                null|                null|                 null|                null|                null|
|We welcome you to...|                   t|                null|                null|                 null|                null|                null|
|               11869|               44764|Cottage by the Re...|This is a very pr...| This is a quie

## EDA

In [78]:
from pyspark.sql.functions import desc

data.groupBy('host_is_superhost')\
  .count()\
  .sort(desc("count"))\
  .show()

+--------------------+------+
|   host_is_superhost| count|
+--------------------+------+
|                null|151864|
|                   f| 57599|
|                   t| 28758|
|     access to pools|  1756|
|      Presa Canarios|   756|
|              family|   267|
|   and Hawaii Island|   250|
| we aim to provid...|   230|
|         conditioner|   177|
|          condiments|   174|
| as well as 24/7 ...|   128|
|          Ocean View|   126|
| and we include a...|   104|
| Claire or Christ...|    98|
| the name on the ...|    96|
| respond and assi...|    90|
|         Paso Robles|    84|
|              hiking|    60|
|              diving|    58|
| and concierge fe...|    58|
+--------------------+------+
only showing top 20 rows



In [80]:
df_target = data.filter(col("host_is_superhost").isin("t","f","null"))
df_target.show()

+-------+--------+--------------------+--------------------+---------------------+--------------------+-----------------+
|     id| host_id|                name|         description|neighborhood_overview|          host_about|host_is_superhost|
+-------+--------+--------------------+--------------------+---------------------+--------------------+-----------------+
|  11869|   44764|Cottage by the Re...|This is a very pr...| This is a quiet n...|Easygoing, enviro...|                f|
|  11879|   44764|Sunny room close ...|Sunny room in a c...|                 null|Easygoing, enviro...|                f|
|  70829|  360285|Master Bedroom w/...|Private entrance,...| Our house is a sh...|As a longtime Air...|                t|
| 264450| 1387567|Walk to everythin...|Comfortable, char...| Our neighborhood ...|Alan and Brit are...|                t|
| 266427|  951619|Cozy, modern cott...|Scandinavian styl...| Quiet safe friend...|We own and love o...|                f|
| 494132|    3177|Privat

## Target Variable

In [None]:
df = df_target.withColumn("target", when(col("host_is_superhost") == 't', 1).otherwise(0))

In [None]:
df.show()

+-------+--------------------+--------------+------------+--------------------+--------------------+---------------------+--------------------+--------+--------------------+----------------+----------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+---------+-----------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+--------------------+--------------------+--------------------+--------------------+------------------

In [None]:
df.groupBy('target')\
  .count()\
  .sort(desc("count"))\
  .show(10, False)

+------+-----+
|target|count|
+------+-----+
|0     |56158|
|1     |30497|
+------+-----+



## Cleaning Airbnb Description

In [None]:
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import nltk

In [None]:
textdf = df.select(col("description"),col("target"))

In [None]:
textdf.head()

Row(description='This is a very private cute cozy small bohemian style retreat next to a creek under the redwood and oak trees. Across the street is a 40 acre park for hiking biking walking frisbee etc. including a dog park<br /><br /><b>The space</b><br />Lovely private setting by the creek, across the street from 40 acre park with trails and playing fields, beach easy walk or bike ride away, shops, restaurants and theater nearby. no traffic very peaceful.<br /><br /><b>Guest access</b><br />There is ample parking in front of the orchard. Bus stop to Santa Cruz or Capitola etc. is a five minute walk down the street.<br /><br /><b>Other things to note</b><br />Recycling and composting as well as minimal use of plastics Are greatly appreciated!', target=0)

In [None]:
df_cleaned = textdf.select(lower(regexp_replace("description", "[^a-zA-Z\\s]", "")).alias("text"))

In [None]:
df_cleaned.head()

Row(text='this is a very private cute cozy small bohemian style retreat next to a creek under the redwood and oak trees across the street is a  acre park for hiking biking walking frisbee etc including a dog parkbr br bthe spacebbr lovely private setting by the creek across the street from  acre park with trails and playing fields beach easy walk or bike ride away shops restaurants and theater nearby no traffic very peacefulbr br bguest accessbbr there is ample parking in front of the orchard bus stop to santa cruz or capitola etc is a five minute walk down the streetbr br bother things to notebbr recycling and composting as well as minimal use of plastics are greatly appreciated')

In [None]:
# tokenize the text data
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df_tokenized = tokenizer.transform(df_cleaned)

In [None]:
# remove stop words from the text data
stop_words = StopWordsRemover.loadDefaultStopWords("english")
stop_words_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens", stopWords=stop_words)
df_filtered = stop_words_remover.transform(df_tokenized)

In [None]:
# show the cleaned and processed text data
df_filtered.head()

Row(text='this is a very private cute cozy small bohemian style retreat next to a creek under the redwood and oak trees across the street is a  acre park for hiking biking walking frisbee etc including a dog parkbr br bthe spacebbr lovely private setting by the creek across the street from  acre park with trails and playing fields beach easy walk or bike ride away shops restaurants and theater nearby no traffic very peacefulbr br bguest accessbbr there is ample parking in front of the orchard bus stop to santa cruz or capitola etc is a five minute walk down the streetbr br bother things to notebbr recycling and composting as well as minimal use of plastics are greatly appreciated', tokens=['this', 'is', 'a', 'very', 'private', 'cute', 'cozy', 'small', 'bohemian', 'style', 'retreat', 'next', 'to', 'a', 'creek', 'under', 'the', 'redwood', 'and', 'oak', 'trees', 'across', 'the', 'street', 'is', 'a', '', 'acre', 'park', 'for', 'hiking', 'biking', 'walking', 'frisbee', 'etc', 'including', '

In [None]:
#Creating new Dataframe with Text and Target Variable
df_filtered = df_filtered.withColumn("target",df.select(col("target")))

TypeError: ignored