In [6]:
from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, DateType, DecimalType, DoubleType
from pyspark.sql import functions as F

In [7]:
# Build (or reuse) the Spark session for this notebook.
# - spark.mongodb.output.uri: default write target for the Mongo connector
#   (database "hotel", collection "raw" on the local MongoDB instance).
# - spark.jars.packages: Maven coordinates of the Mongo Spark connector to
#   fetch; the "_2.11" suffix is the Scala build the artifact targets.
spark = (
    SparkSession.builder
    .appName("init")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/hotel.raw")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")
    .getOrCreate()
)

In [8]:
# Explicit schema for Hotel_Reviews.csv. Every column is nullable so that a
# value which fails to parse does not abort the load.
schema = StructType([
    StructField("Hotel_Address", StringType(), True),
    StructField("Additional_Number_of_Scoring", IntegerType(), True),
    StructField("Review_Date", TimestampType(), True),
    StructField("Average_Score", DoubleType(), True),
    StructField("Hotel_Name", StringType(), True),
    StructField("Reviewer_Nationality", StringType(), True),
    StructField("Negative_Review", StringType(), True),
    StructField("Review_Total_Negative_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews", IntegerType(), True),
    StructField("Positive_Review", StringType(), True),
    StructField("Review_Total_Positive_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType(), True),
    StructField("Reviewer_Score", DoubleType(), True),
    StructField("Tags", StringType(), True),
    StructField("days_since_review", StringType(), True),
    # DecimalType(precision, scale): integer digits = precision - scale.
    # Latitude is within +/-90, so two integer digits are enough.
    StructField("lat", DecimalType(9, 7), True),
    # Longitude is within +/-180 and needs three integer digits. The previous
    # DecimalType(8, 7) left room for only one, so any |lng| >= 10 overflowed
    # the declared precision and could not be stored.
    StructField("lng", DecimalType(10, 7), True),
])

# Read the CSV with the schema above instead of inferring types.
# "M/d/yyyy" accepts both padded ("08/03/2017") and non-padded ("8/3/2017")
# dates; "MM/dd/yyyy" is rejected for non-padded values by strict parsers.
# NOTE(review): the exact date layout in the file is not visible here -- confirm.
raw_reviews = (
    spark.read.format("csv")
    .schema(schema)
    .option("dateFormat", "M/d/yyyy")
    .option("timestampFormat", "M/d/yyyy")
    .option("header", "true")
    .load("Hotel_Reviews.csv")
)

In [9]:
# Preview the first 20 rows, truncating long cell values (the defaults of show()).
raw_reviews.show(n=20, truncate=True)

+--------------------+----------------------------+-------------------+-------------+-----------+--------------------+--------------------+---------------------------------+-----------------------+--------------------+---------------------------------+------------------------------------------+--------------+--------------------+-----------------+----------+---------+
|       Hotel_Address|Additional_Number_of_Scoring|        Review_Date|Average_Score| Hotel_Name|Reviewer_Nationality|     Negative_Review|Review_Total_Negative_Word_Counts|Total_Number_of_Reviews|     Positive_Review|Review_Total_Positive_Word_Counts|Total_Number_of_Reviews_Reviewer_Has_Given|Reviewer_Score|                Tags|days_since_review|       lat|      lng|
+--------------------+----------------------------+-------------------+-------------+-----------+--------------------+--------------------+---------------------------------+-----------------------+--------------------+---------------------------------+------

In [10]:
# Persist the raw reviews to MongoDB through the "mongo" data source. No
# target is passed to save(), so the connector presumably falls back to the
# session's spark.mongodb.output.uri setting.
# NOTE(review): "append" adds to the existing collection, so re-running this
# cell likely inserts the same reviews again -- confirm this is intended.
mongo_writer = raw_reviews.write.format("mongo")
mongo_writer = mongo_writer.mode("append")
mongo_writer.save()