In [None]:
import os
# Bootstrap an interactive PySpark session by executing the shell start-up
# script shipped with the local Spark installation (located via SPARK_HOME).
# Later cells use `sc` and `spark` without defining them, so they are
# presumably created by this script.
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')) as shell_script:
    shell_source = shell_script.read()
exec(shell_source)

#### Add the config file to the Spark context

In [2]:
# Register the project's config.py with the Spark context so that it can be
# imported as a module in the following cell.
# NOTE(review): this is an absolute, machine-specific path — it has to be
# adjusted before the notebook can run anywhere else.
sc.addFile(
    path='/Users/sumangangopadhyay/complex-spark-transformations/config.py',
    recursive=False,
)

#### Import the relevant libraries

In [3]:
import config as cf
from pyspark.sql import functions as func

#### Get the relevant configurations in variables

In [4]:
# Location of the input data, as supplied by the project config module.
data_path = cf.data_path()

# Each variable group is returned by the config module as a single
# comma-separated string; split every group into a list of column names.
(
    primary_response_variables,
    secondary_response_variables,
    primary_explanatory_variables,
) = [
    read_variable_group().split(',')
    for read_variable_group in (
        cf.primary_response_variables,
        cf.secondary_response_variables,
        cf.primary_explanatory_variables,
    )
]

#### Read the data

In [5]:
# Load the CSV at data_path, treating its first line as the column header.
# No schema is supplied or inferred here.
df = spark.read.option('header', True).csv(data_path)

#### Replace the spaces in the column names with underscores so that the columns are easier to use later on

In [6]:
# Rename every column so that spaces in the header become underscores.
# toDF() assigns the new names by position, so the original header text is
# never parsed as a column expression; func.col(name) would treat a '.' in a
# header as struct-field access and fail on such columns.
df_with_no_spaces_in_colm_names = df.toDF(
    *[column_name.replace(' ', '_') for column_name in df.columns]
)

#### Get the count of distinct values of the attributes that form the response variables (in statistical terms, response variables are the variables on the y-axis, i.e. the variables whose variation is being observed)

In [7]:
# Build a one-row frame holding, for every primary response variable, the
# number of distinct values in that column; each result column is named
# 'unique_<column>'.
unique_count_of_primary_response_variables = df_with_no_spaces_in_colm_names.select(
    *(
        func.countDistinct(variable).alias('unique_' + variable)
        for variable in primary_response_variables
    )
)

In [8]:
# Trigger the aggregation and print the distinct counts
# (show()'s default limits spelled out explicitly).
unique_count_of_primary_response_variables.show(n=20, truncate=True)

21/10/15 18:39:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------------------------+-----------------+---------------------+------------------+---------------------------+---------------------+
|unique_Registration_State|unique_Plate_Type|unique_Violation_Code|unique_Law_Section|unique_Violation_Legal_Code|unique_Issuing_Agency|
+-------------------------+-----------------+---------------------+------------------+---------------------------+---------------------+
|                       67|               86|                  100|                 8|                          4|                   17|
+-------------------------+-----------------+---------------------+------------------+---------------------------+---------------------+



                                                                                

In [9]:
# Build a one-row frame holding, for every secondary response variable, the
# number of distinct values in that column; each result column is named
# 'unique_<column>'.
unique_count_of_secondary_response_variables = df_with_no_spaces_in_colm_names.select(
    *(
        func.countDistinct(variable).alias('unique_' + variable)
        for variable in secondary_response_variables
    )
)

In [10]:
# Trigger the aggregation and print the distinct counts
# (show()'s default limits spelled out explicitly).
unique_count_of_secondary_response_variables.show(n=20, truncate=True)



+-----------------------+-------------------+-------------------+
|unique_Violation_County|unique_Issuer_Squad|unique_Vehicle_Year|
+-----------------------+-------------------+-------------------+
|                     12|                 49|                100|
+-----------------------+-------------------+-------------------+



                                                                                

#### Check for NaNs and nulls in the explanatory variables. These variables typically go on the x-axis. Statistically, we are interested in the extent to which the variation in the response variables is associated with the variation in these variables

In [11]:
# For every primary explanatory variable, count the rows whose value is null
# or NaN; each result column is named 'null_nan_count_<column>'.
# when() emits a non-null marker (the column-name string) only for rows that
# match the condition and null otherwise, and count() ignores nulls, so the
# aggregate is exactly the number of matching rows.
nan_null_count_in_primary_explanatory_variables = df_with_no_spaces_in_colm_names.select(
    *(
        func.count(
            func.when(func.col(variable).isNull() | func.isnan(variable), variable)
        ).alias('null_nan_count_' + variable)
        for variable in primary_explanatory_variables
    )
)

In [12]:
# Trigger the aggregation and print the null/NaN counts
# (show()'s default limits spelled out explicitly).
nan_null_count_in_primary_explanatory_variables.show(n=20, truncate=True)

[Stage 7:>                                                        (0 + 16) / 16]

+-------------------------+-----------------------------+
|null_nan_count_Issue_Date|null_nan_count_Violation_Time|
+-------------------------+-----------------------------+
|                        0|                           63|
+-------------------------+-----------------------------+





#### Create a new categorical explanatory variable (categorical variables are factors with 2 or more levels, e.g. the colours of a rainbow form a factor with 7 levels)

In [13]:
# Add the categorical column 'Violation_AM_or_PM': the single character at
# (1-based) position 5 of Violation_Time, e.g. 'A' for '0143A' and 'P' for '0400P'.
# when() without otherwise() yields null for rows that do not satisfy the
# condition, so rows whose Violation_Time is null or NaN get a null marker.
df_with_no_spaces_in_colm_names = df_with_no_spaces_in_colm_names.withColumn(
    'Violation_AM_or_PM',
    func.when(
        ~func.isnan(func.col('Violation_Time')) & func.col('Violation_Time').isNotNull(),
        func.substring(func.col('Violation_Time'), 5, 1),
    ),
)

#### Verify that the new column has been populated correctly

In [14]:
# Spot-check: for rows that do have a Violation_Time, print it next to the
# derived AM/PM marker (first 10 rows only).
(
    df_with_no_spaces_in_colm_names
    .select('Violation_Time', 'Violation_AM_or_PM')
    .where(func.col('Violation_Time').isNotNull())
    .show(10)
)

+--------------+------------------+
|Violation_Time|Violation_AM_or_PM|
+--------------+------------------+
|         0143A|                 A|
|         0400P|                 P|
|         0233P|                 P|
|         1120A|                 A|
|         0555P|                 P|
|         0852P|                 P|
|         0215A|                 A|
|         0758A|                 A|
|         1005A|                 A|
|         0845A|                 A|
+--------------+------------------+
only showing top 10 rows



In [15]:
# Spot-check: for rows with a null Violation_Time, the derived AM/PM marker
# should be null as well (first 10 rows only).
(
    df_with_no_spaces_in_colm_names
    .select('Violation_Time', 'Violation_AM_or_PM')
    .where(func.col('Violation_Time').isNull())
    .show(10)
)

[Stage 11:>                                                         (0 + 4) / 4]

+--------------+------------------+
|Violation_Time|Violation_AM_or_PM|
+--------------+------------------+
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
|          null|              null|
+--------------+------------------+
only showing top 10 rows



                                                                                

In [16]:
# Spot-check: list any rows whose Violation_Time evaluates to NaN together
# with the derived AM/PM marker (first 10 rows only).
(
    df_with_no_spaces_in_colm_names
    .select('Violation_Time', 'Violation_AM_or_PM')
    .where(func.isnan(func.col('Violation_Time')))
    .show(10)
)

[Stage 14:>                                                       (0 + 11) / 11]

+--------------+------------------+
|Violation_Time|Violation_AM_or_PM|
+--------------+------------------+
+--------------+------------------+



