In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS_Datass transformation').getOrCreate()

In [2]:
from pyspark.sql.types import *
schema = StructType([
               StructField('year',IntegerType(),True),
               StructField('month',IntegerType(),True),
               StructField('day',IntegerType(),True),
               StructField('hour',IntegerType(),True),
               StructField('PM25',FloatType(),True),
               StructField('SO2',FloatType(),True),
               StructField('NO2',FloatType(),True),
               StructField('CO',FloatType(),True),
               StructField('O3',FloatType(),True),
               StructField('TEMP',FloatType(),True),
               StructField('PRES',FloatType(),True),
               StructField('DewPointTempeature',FloatType(),True),
               StructField('RAIN',FloatType(),True),
               StructField('WindDirection',StringType(),True),
               StructField('WindSpend',FloatType(),True),
               StructField('station',StringType(),True),
               StructField('NQR',StringType(),True),
               StructField('Season',StringType(),True),
                ])

In [3]:
file = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("../final_dataset/finishedpreprocessing.csv")

In [18]:
print(type(file))

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
df.show()

In [5]:
df = df.drop(*["PM25","day"])
df.show()

+----+-----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|year|month|hour| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|         NQR|Season|
+----+-----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|2015|    1|   0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|        Good|Winter|
|2015|    1|   1|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|        Good|Winter|
|2015|    1|   2|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|        Good|Winter|
|2015|    1|   3|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|        Good|Winter|
|2015|    1|   4|15.0| 12.0| 400.0|58.0| 0.0|1030.0|   

In [6]:
from pyspark.ml.feature import StringIndexer
column = ['WindDirection','station','NQR','Season']
for i in column:
    indexer = StringIndexer(inputCol =i, outputCol= i+"Index" )
    model = indexer.fit(df)
    indexed = model.transform(df)
    df = indexed
new_df = df.drop(*column)

In [None]:
new_df.show()

In [7]:
from pyspark.ml.feature import RFormula
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(new_df).transform(new_df)
vectorformat = output.select("features", "label")

In [None]:
vectorformat.show(truncate = False)

In [8]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

selector = ChiSqSelector(numTopFeatures=16, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")

result = selector.fit(vectorformat).transform(vectorformat)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())


ChiSqSelector output with top 16 features selected


In [9]:
result.select('features','label','selectedFeatures').show(20,truncate = False)

+-----------------------------------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------------------------------------+
|features                                                                                                   |label|selectedFeatures                                                                                           |
+-----------------------------------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------------------------------------+
|[2015.0,1.0,0.0,10.0,16.0,400.0,54.0,-1.0,1027.0,-23.0,0.0,0.8999999761581421,1.0,4.0,0.0]                 |0.0  |[2015.0,1.0,0.0,10.0,16.0,400.0,54.0,-1.0,1027.0,-23.0,0.0,0.8999999761581421,1.0,4.0,0.0]                 |
|[2015.0,1.0,1.0,11.0,17.0,400.0,53.0,0.0,1025.0,-22.899999618530273,0.0,2.700000047683716,1.0,4.0,0.0] 

In [10]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=int(9), maxDepth=int(5), labelCol='label', seed=11)
model = rf.fit(vectorformat)

In [11]:
ff=model.featureImportances
importancesList=[float(col) for col in  ff]
colList = new_df.columns
result=dict(zip(colList,importancesList))
print(result)

{'NQRIndex': 0.0, 'year': 0.00013154716809603672, 'DewPointTempeature': 0.0884079368264622, 'stationIndex': 0.001036175290576831, 'NO2': 0.1810933064930174, 'CO': 0.39062896097298794, 'WindDirectionIndex': 0.02110121663588012, 'month': 0.009670148938959573, 'TEMP': 0.024795955307260142, 'hour': 0.0005757787124201702, 'PRES': 0.02374150268508552, 'O3': 0.046052694563309626, 'RAIN': 0.0, 'WindSpend': 0.002673793209110729, 'SO2': 0.2100909831968337}


In [None]:
def sort_by_value(d): 
    items=d.items() 
    backitems=[[v[1],v[0]] for v in items] 
    backitems.sort() 
    return [ backitems[i][1] for i in range(0,len(backitems))]

In [None]:
dic = sort_by_value(result)
dic.reverse()
print(dic)

In [12]:
after_feature = new_df.drop(*['hour','RAIN','year'])
after_feature.show()
print("Number of attribute: ",len(after_feature.columns))

+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|month| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|WindSpend|WindDirectionIndex|stationIndex|NQRIndex|SeasonIndex|
+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|    1|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0|      0.9|               1.0|         4.0|     0.0|        0.0|
|    1|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9|      2.7|               1.0|         4.0|     0.0|        0.0|
|    1|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9|      2.4|               1.0|         4.0|     0.0|        0.0|
|    1|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4|      2.4|               1.0|         4.0|     0.0|        0.0|
|    1|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4|      2.4|               1.0|         4.0|     0.0|        0.0|


In [None]:
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(after_feature).transform(after_feature)
vectorformat2 = output.select("features", "label")

In [13]:
from pyspark.sql.functions import col, explode, array, lit
df_0 = after_feature.filter(col("NQRIndex") == 0)
df_1 = after_feature.filter(col("NQRIndex") == 1)
df_2 = after_feature.filter(col("NQRIndex") == 2)
df_3 = after_feature.filter(col("NQRIndex") == 3)
df_4 = after_feature.filter(col("NQRIndex") == 4)
df_5 = after_feature.filter(col("NQRIndex") == 5)

print(df_0.count(),df_1.count(),df_2.count(),df_3.count(),df_4.count(),df_5.count())

108279 49511 38691 12271 3658 1791


In [14]:
count_list = [df_1,df_2,df_3,df_4,df_5]
combined_df = df_0
for element in count_list:
    ratio = int(df_0.count()/element.count())
    a = range(ratio)
    oversampled_df = element.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
    combined_df = combined_df.unionAll(oversampled_df)

In [20]:
df_0 = combined_df.filter(col("NQRIndex") == 0)
df_1 = combined_df.filter(col("NQRIndex") == 1)
df_2 = combined_df.filter(col("NQRIndex") == 2)
df_3 = combined_df.filter(col("NQRIndex") == 3)
df_4 = combined_df.filter(col("NQRIndex") == 4)
df_5 = combined_df.filter(col("NQRIndex") == 5)

print(df_0.count(),df_1.count(),df_2.count(),df_3.count(),df_4.count(),df_5.count())

108279 99022 77382 98168 106082 107460


In [19]:
file = combined_df
file.write.csv("../final_dataset/view2.csv",header = True)

In [15]:
print(combined_df.count())

596393


In [22]:
combined_df.printSchema()
combined_df.show()

root
 |-- month: integer (nullable = true)
 |-- SO2: float (nullable = true)
 |-- NO2: float (nullable = true)
 |-- CO: float (nullable = true)
 |-- O3: float (nullable = true)
 |-- TEMP: float (nullable = true)
 |-- PRES: float (nullable = true)
 |-- DewPointTempeature: float (nullable = true)
 |-- WindSpend: float (nullable = true)
 |-- WindDirectionIndex: double (nullable = true)
 |-- stationIndex: double (nullable = true)
 |-- NQRIndex: double (nullable = true)
 |-- SeasonIndex: double (nullable = true)

+-----+----+----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|month| SO2| NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|WindSpend|WindDirectionIndex|stationIndex|NQRIndex|SeasonIndex|
+-----+----+----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|    1|10.0|16.0| 400.0|54.0|-1.0|1027.0|             -23.0|      0.9|               1.0|         4.0|     0.0|   

In [23]:
combined_df.count()

596393

In [24]:
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(combined_df).transform(combined_df)
vectorformat3 = output.select("features", "label")

In [25]:
vectorformat3.show()
vectorformat3.count()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,10.0,16.0,40...|  0.0|
|[1.0,11.0,17.0,40...|  0.0|
|[1.0,10.0,15.0,40...|  0.0|
|[1.0,13.0,13.0,40...|  0.0|
|[1.0,15.0,12.0,40...|  0.0|
|[1.0,13.0,15.0,50...|  0.0|
|[1.0,10.0,30.0,50...|  0.0|
|[1.0,10.0,41.0,60...|  0.0|
|[1.0,10.0,55.0,60...|  0.0|
|[1.0,11.0,56.0,60...|  0.0|
|[1.0,15.0,56.0,90...|  0.0|
|[1.0,26.0,64.0,13...|  0.0|
|[1.0,42.0,76.0,17...|  0.0|
|[1.0,29.0,59.0,13...|  0.0|
|[1.0,18.0,33.0,90...|  0.0|
|[1.0,19.0,39.0,10...|  0.0|
|[1.0,23.0,58.0,13...|  0.0|
|[1.0,28.0,91.0,15...|  0.0|
|[1.0,23.0,86.0,15...|  0.0|
|[1.0,17.0,62.0,10...|  0.0|
+--------------------+-----+
only showing top 20 rows



596393