In [1]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Build (or reuse) the notebook's Spark session, step by step.
# The PostgreSQL JDBC jar is registered through `spark.jars`
# (presumably for a later database read/write -- confirm against later cells).
session_builder = SparkSession.builder.appName("Stack Overflow Data Wrangling")
session_builder = session_builder.config("spark.jars", "../jars/postgresql-42.2.8.jar")
spark = session_builder.getOrCreate()

In [5]:
# Load the four Stack Overflow CSV extracts used in this notebook.
# Every file is read with the same reader options: the first line is a header,
# column types are inferred, and records may span multiple lines.
csv_options = {"header": True, "inferSchema": True, "multiLine": True}
answers = spark.read.csv("stackoverflow/answers.csv", **csv_options)
questions = spark.read.csv("stackoverflow/questions.csv", **csv_options)
users = spark.read.csv("stackoverflow/users.csv", **csv_options)
questiontags = spark.read.csv("stackoverflow/question_tags.csv", **csv_options)

In [10]:
# pandas-like "shape" helper for Spark DataFrames
def spark_shape(self):
    """Return ``(number_of_rows, number_of_columns)`` for a DataFrame.

    ``self`` is the object the helper is called on; it must provide a
    ``count()`` method and a ``columns`` sequence. The row count comes from
    ``self.count()`` and the column count from ``len(self.columns)``.
    """
    n_rows = self.count()
    n_cols = len(self.columns)
    return n_rows, n_cols
# Attach the helper to Spark's DataFrame class so that it can be called as
# `some_dataframe.shape()` on any DataFrame in this notebook.
setattr(pyspark.sql.dataframe.DataFrame, "shape", spark_shape)

In [13]:
# (row count, column count) of the answers DataFrame, via the shape() helper
# patched onto DataFrame above.
answers.shape()

(9367215, 7)

In [14]:
# (row count, column count) of the questions DataFrame.
questions.shape()

(6773193, 9)

In [15]:
# (row count, column count) of the users DataFrame.
users.shape()

(273489, 12)

In [16]:
# (row count, column count) of the question_tags DataFrame.
questiontags.shape()

(633700, 2)

In [18]:
# List the column names of the users DataFrame.
users.columns

['id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'created_at',
 'updated_at']

In [21]:
# Report how many user records were loaded, then preview the first two rows.
total_user_records = users.count()
print('Total Records of Users = {}'.format(total_user_records))
users.show(2)

Total Records of Users = 273489
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|     id|display_name|reputation|         website_url|            location|about_me|views|up_votes|down_votes|           image_url|         created_at|         updated_at|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|8357266|      suryan|         7|https://twitter.c...|Bangalore, Karnat...|    null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|
|2602456|         Avi|         1|https://avtechtoo...|              Canada|    null|    0|       0|         0|                null|2013-07-20 15:10:25|2019-07-08 20:43:40|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+

In [22]:
# Count users per distinct `location` value. Locations are free-text profile
# strings (cities, regions and countries mixed), not normalized country names.
countries = users.groupBy('location').count()
# show() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" after the table.
countries.show()

+--------------------+-----+
|            location|count|
+--------------------+-----+
|  Nowshera, Pakistan|    1|
|           Bangalore|  165|
|San Francisco Bay...|   18|
|Eden Prairie, MN,...|    4|
|     Beograd, Serbia|    4|
|Cluj-Napoca, Cluj...|   33|
|Montreal, Quebec,...|    2|
|                Utah|   46|
| Aalsmeer, Nederland|    1|
|    Tlemcen, Algérie|    2|
|Tirupur, Tamil Na...|    4|
|São Gonçalo, RJ, ...|    1|
|       Suzhou, China|    3|
|Izmir, İzmir, Turkey|   11|
| Bayern, Deutschland|   16|
|       Toruń, Polska|    4|
|Newtown, Kolkata,...|    1|
|  Verona, VR, Italia|   19|
|Santa Marta, Magd...|    1|
|           kathmandu|    5|
+--------------------+-----+
only showing top 20 rows

None


In [33]:
# Preview the display name and location of the first ten users.
users.select(['display_name', 'location']).show(n=10)

+------------------+--------------------+
|      display_name|            location|
+------------------+--------------------+
|            suryan|Bangalore, Karnat...|
|               Avi|              Canada|
|              Matt|Pennsylvania, Uni...|
|          Wing Fan|                null|
|             A.Raw|New Delhi, Delhi,...|
|           Ringo64|                null|
|Hirotaka Nishimiya|          日本 Tōkyō|
|           Anuroop|                null|
|      Franco Buhay|                null|
|     Kartik Juneja|Gharaunda, Haryan...|
+------------------+--------------------+
only showing top 10 rows



In [54]:
# Keep only the users whose `location` text contains the substring 'Canada'
# and store them in a new DataFrame called `country`.
country = users.filter(users['location'].contains('Canada'))

In [51]:
# Preview the first five rows of the Canada subset (name and location only).
country.select(['display_name', 'location']).show(n=5)

+----------------+-------------------+
|    display_name|           location|
+----------------+-------------------+
|             Avi|             Canada|
|           0-DAY|             Canada|
|    Jeremy Banks|             Canada|
|        siyi wei|Toronto, ON, Canada|
|Michael Sheinman|Grimsby, ON, Canada|
+----------------+-------------------+
only showing top 5 rows



In [53]:
# (row count, column count) of the Canada subset stored in `country`.
country.shape()

(3329, 12)