In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build the session first: getOrCreate() reuses any SparkContext that already
# exists, so creating a bare SparkContext() beforehand leaves the application
# running under the default name instead of "FinalProject".
spark = SparkSession.builder.appName("FinalProject").getOrCreate()

# Expose the session's underlying context under the name later cells use.
sc = spark.sparkContext

In [2]:
# Bare expression: the notebook renders the SparkContext as this cell's output.
sc

In [3]:
# Read the JSON data set from S3 and print the schema Spark inferred for it.
rc = spark.read.json('s3://jk2060/final_project')
rc.printSchema()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_created_utc: long (nullable = true)
 |-- author_flair_background_color: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_richtext: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- a: string (nullable = true)
 |    |    |-- e: string (nullable = true)
 |    |    |-- t: string (nullable = true)
 |    |    |-- u: string (nullable = true)
 |-- author_flair_template_id: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-

In [16]:
# Total number of rows in the data set.
rc.count()

30520027

In [15]:
# Count the rows (individual Reddit comments) whose gilded count is greater than 0.
# `col` is imported here so this cell does not depend on the import cell that
# appears further down the notebook and still runs on a fresh kernel.
from pyspark.sql.functions import col

test = rc.where(col('gilded') > 0)
test.count()

9097

In [19]:
# Percentage of all rows that are gilded:
# (rows with gilded > 0) / (total rows), using the two counts reported above.
9097 / 30520027 * 100

0.029806657772615992

In [21]:
# Summarise the percentage computed above.
# NOTE(review): the counts are over rows of `rc`, which look like individual
# comments rather than subreddits — confirm the wording of this message.
print('As we can see, in our dataset, only 0.0298% of subreddits are gilded.')

As we can see, in our dataset, only 0.0298% of subreddits are gilded.


In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from matplotlib import pyplot as plt

# Build a random forest regression model to see whether the gilded count can be predicted from the Reddit score.

In [15]:
def RandomForestRegression(df, featuresCol, labelCol, seed=None):
    """Fit a Spark ML random forest regressor on an 80/20 split of ``df``.

    Args:
        df: Spark DataFrame holding the feature and label columns.
        featuresCol: Name of the feature-vector column passed to the regressor.
        labelCol: Name of the label column passed to the regressor.
        seed: Optional seed for the train/test split. The default ``None``
            keeps the original unseeded behaviour; pass an integer to make
            the split repeatable.

    Returns:
        The result of ``model.transform`` applied to the held-out 20% split.
    """
    # Import the Spark estimator explicitly under its own alias. The notebook
    # also imports sklearn's RandomForestRegressor, so the bare name depends on
    # which import cell ran last; sklearn's class does not take
    # featuresCol/labelCol.
    from pyspark.ml.regression import RandomForestRegressor as SparkRandomForestRegressor

    train_df, test_df = df.randomSplit([0.8, 0.2], seed=seed)

    regressor = SparkRandomForestRegressor(featuresCol=featuresCol, labelCol=labelCol)
    model = regressor.fit(train_df)

    # Score only the held-out rows so the predictions are out-of-sample.
    return model.transform(test_df)

In [26]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the single input column 'score' into the vector column 'features'.
assembler = VectorAssembler(inputCols = ['score'], outputCol = 'features')

In [69]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import RegressionEvaluator

# Name the Spark ML estimator explicitly. This cell also imports sklearn's
# RandomForestRegressor, and the bare name only resolves to the Spark class
# because the pyspark star-import happens to run afterwards; sklearn's class
# does not accept featuresCol/labelCol.
from pyspark.ml.regression import RandomForestRegressor as SparkRandomForestRegressor

rf = SparkRandomForestRegressor(featuresCol='features', labelCol='gilded')


In [70]:
# Chain feature assembly and the random forest so both run in one fit/transform.
pipeline = Pipeline(stages = [assembler, rf])

In [71]:
# Hold out 20% of the rows for evaluation. The seed fixes the split so the
# reported RMSE can be reproduced on a re-run.
(training, test) = rc.randomSplit([0.8, 0.2], seed=42)

# Fit the assembler + random forest pipeline on the training rows only.
model = pipeline.fit(training)

In [72]:
# Apply the fitted pipeline to the held-out rows.
predictions = model.transform(test)

In [73]:
# Score the held-out predictions against the true 'gilded' values using RMSE.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='gilded',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)

print('The RMSE on test data is %g' % rmse)

The RMSE on test data is 0.02481


In [11]:
## We will now look at which subreddit has the highest-scoring comment in our dataset.

In [5]:
# Register the DataFrame as the temp view 'rc' so it can be queried with spark.sql.
rc.createOrReplaceTempView('rc')

In [6]:
# Highest score for every (subreddit, gilded) pair, largest first.
# Keep the DataFrame in `max_score` and display it separately: show() returns
# None, so chaining it onto the assignment would store None instead.
max_score = spark.sql("""
    SELECT MAX(score), gilded, subreddit
    FROM rc
    GROUP BY subreddit, gilded
    ORDER BY MAX(score) DESC
""")
max_score.show()


+----------+------+-----------------+
|max(score)|gilded|        subreddit|
+----------+------+-----------------+
|     53571|     0|        AskReddit|
|     53424|     5|        AskReddit|
|     50985|     1|        AskReddit|
|     49654|     3|        AskReddit|
|     40194|     2|        AskReddit|
|     40043|     4|        AskReddit|
|     39586|     1|             tifu|
|     33755|     7|        AskReddit|
|     33464|     0|              WTF|
|     30865|    33|           videos|
|     28902|     0|    todayilearned|
|     28371|     1|             gifs|
|     27893|     0|           movies|
|     27620|     1|interestingasfuck|
|     25858|     4|             pics|
|     25760|     5|    AmItheAsshole|
|     23889|     1|           videos|
|     23401|     0|mildlyinteresting|
|     23267|     0|             news|
|     23020|     2|          RoastMe|
+----------+------+-----------------+
only showing top 20 rows

The max score in our dataset is 53571, which belongs to the AskReddit subreddit.

In [8]:
# Fetch the body of the row(s) holding the top score found by the previous query.
# Keep the DataFrame in `max_score_about` and display it separately: show()
# returns None, so chaining it onto the assignment would store None instead.
max_score_about = spark.sql("""
    SELECT body, score
    FROM rc
    WHERE score = 53571
""")
# truncate=False prints the full comment text instead of cutting it short.
max_score_about.show(truncate = False)

print("The Max score for our dataset is 53571 which is AskReddit subreddit. This has a gilded score of 0. Above is the actual comment.")

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|body                                                                                                                                                                                                                                                    |score|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|My roommate would get up and steal the shower as soon as he heard my alarm go off.  6 weeks later, I had him waking up to shower at 4:30.  I would just turn off my pavlovian alarm and go back to sleep for another 3 hours waiting

In [43]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()