In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS').getOrCreate()

In [2]:
# create correct data schema.
from pyspark.sql.types import *
schema = StructType([
               StructField('year',IntegerType(),True),
               StructField('month',IntegerType(),True),
               StructField('day',IntegerType(),True),
               StructField('hour',IntegerType(),True),
               StructField('PM25',FloatType(),True),
               StructField('SO2',FloatType(),True),
               StructField('NO2',FloatType(),True),
               StructField('CO',FloatType(),True),
               StructField('O3',FloatType(),True),
               StructField('TEMP',FloatType(),True),
               StructField('PRES',FloatType(),True),
               StructField('DewPointTempeature',FloatType(),True),
               StructField('RAIN',FloatType(),True),
               StructField('WindDirection',StringType(),True),
               StructField('WindSpend',FloatType(),True),
               StructField('station',StringType(),True),
               StructField('NQR',StringType(),True),
               StructField('Season',StringType(),True),
                ])

In [3]:
df = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("../final_dataset/final.csv")


In [4]:
df.filter(df.NQR == 'Very Poor').show()


+----+-----+---+----+----+---+---+---+---+----+----+------------------+----+-------------+---------+-------+---+------+
|year|month|day|hour|PM25|SO2|NO2| CO| O3|TEMP|PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|station|NQR|Season|
+----+-----+---+----+----+---+---+---+---+----+----+------------------+----+-------------+---------+-------+---+------+
+----+-----+---+----+----+---+---+---+---+----+----+------------------+----+-------------+---------+-------+---+------+



In [5]:
from pyspark.ml.feature import StringIndexer


In [6]:
indexer = StringIndexer(inputCol ='WindDirection', outputCol= 'WindDirectionIndex')
model = indexer.fit(df)
indexed = model.transform(df)


In [7]:
column = ['WindDirection','station','NQR','Season']
for i in column:
    indexer = StringIndexer(inputCol =i, outputCol= i+"Index" )
    model = indexer.fit(df)
    indexed = model.transform(df)
    df = indexed

In [8]:
new_df = df.drop(*column)

In [9]:
new_df.show(5)

+----+-----+---+----+----+----+----+-----+----+----+------+------------------+----+---------+------------------+------------+--------+-----------+
|year|month|day|hour|PM25| SO2| NO2|   CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindSpend|WindDirectionIndex|stationIndex|NQRIndex|SeasonIndex|
+----+-----+---+----+----+----+----+-----+----+----+------+------------------+----+---------+------------------+------------+--------+-----------+
|2015|    1|  1|   2| 3.0|10.0|15.0|400.0|55.0| 0.0|1027.0|             -22.9| 0.0|      2.4|               1.0|         7.0|     0.0|        0.0|
|2015|    1|  1|   3| 4.0|13.0|13.0|400.0|57.0| 0.0|1028.0|             -24.4| 0.0|      2.4|               1.0|         7.0|     0.0|        0.0|
|2015|    1|  1|   4| 4.0|15.0|12.0|400.0|58.0| 0.0|1030.0|             -24.4| 0.0|      2.4|               1.0|         7.0|     0.0|        0.0|
|2015|    1|  1|   5| 3.0|13.0|15.0|500.0|55.0|-1.0|1024.0|             -24.4| 0.0|      3.2|              11.0|      

In [9]:
from pyspark.ml.feature import RFormula
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(new_df).transform(new_df)
feature_selection = output.select("features", "label")

In [10]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

selector = ChiSqSelector(numTopFeatures=4, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")

result = selector.fit(feature_selection).transform(feature_selection)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.select('features','label','selectedFeatures').show(100,truncate = True)

ChiSqSelector output with top 4 features selected
+--------------------+-----+--------------------+
|            features|label|    selectedFeatures|
+--------------------+-----+--------------------+
|[2015.0,1.0,1.0,2...|  0.0|[2015.0,1.0,1.0,2.0]|
|[2015.0,1.0,1.0,3...|  0.0|[2015.0,1.0,1.0,3.0]|
|[2015.0,1.0,1.0,4...|  0.0|[2015.0,1.0,1.0,4.0]|
|[2015.0,1.0,1.0,5...|  0.0|[2015.0,1.0,1.0,5.0]|
|[2015.0,1.0,1.0,6...|  0.0|[2015.0,1.0,1.0,6.0]|
|[2015.0,1.0,1.0,8...|  0.0|[2015.0,1.0,1.0,8.0]|
|[2015.0,1.0,1.0,9...|  0.0|[2015.0,1.0,1.0,9.0]|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,1.0,1...|  0.0|[2015.0,1.0,1.0,1...|
|[2015.0,1.0,2.0,4...|  0.0|[2015.0,1.0,2.0,4.0]|
|[2015.0,1.0,2.0,5...|  0.0|[2015.0,1.0,2.0,5.0]|
|[2015.0,1.0,2.0,6...|  0.0|[2015.0,1.0,2.0,6.0]|


In [35]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

df = feature_selection.select("features")
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
result = model.transform(df).select('pcaFeatures')
result.show(10000)

+--------------------+
|         pcaFeatures|
+--------------------+
|[-399.98936435568...|
|[-399.94978766972...|
|[-399.90705608381...|
|[-499.72533264070...|
|[-500.62130713173...|
|[-601.96350215983...|
|[-601.96889955344...|
|[-900.98755238998...|
|[-1300.8778593438...|
|[-1301.9798356317...|
|[-900.89627988641...|
|[-1000.9804823758...|
|[-1301.2657957623...|
|[-1503.4104750974...|
|[-1502.5297898007...|
|[-1001.7293461418...|
|[-901.56327035560...|
|[-901.98102274676...|
|[-802.00862703902...|
|[-801.16662038660...|
|[-700.83624439118...|
|[-600.99467699029...|
|[-600.67089605485...|
|[-601.11233873865...|
|[-1001.8378309365...|
|[-500.96017474370...|
|[-500.71895931799...|
|[-400.20819981436...|
|[-300.44260326366...|
|[-299.95502767607...|
|[-300.11700046094...|
|[-400.21164455714...|
|[-400.10534277289...|
|[-400.17453127182...|
|[-400.31538808383...|
|[-300.31641470306...|
|[-300.54423164332...|
|[-501.47473582346...|
|[-601.90612260895...|
|[-702.29627483212...|
|[-702.0455

In [12]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=int(2), maxDepth=int(4), labelCol='label', seed=11)
model = rf.fit(feature_selection)

In [13]:
ff=model.featureImportances
importancesList=[float(col) for col in  ff]
colList = new_df.columns
result=dict(zip(colList,importancesList))
print(result)

{'SO2': 0.0008204226356877305, 'TEMP': 0.0, 'WindDirectionIndex': 0.001688752584229212, 'year': 0.0, 'WindSpend': 0.019777215140169512, 'PRES': 0.0, 'CO': 0.10418290298996702, 'DewPointTempeature': 0.006172533666245003, 'NQRIndex': 0.0, 'NO2': 0.0015246934803152445, 'O3': 0.0, 'month': 0.0, 'hour': 0.0002349291764291249, 'stationIndex': 0.0, 'RAIN': 0.0, 'day': 0.0, 'PM25': 0.8655985503269572}


In [16]:
from pyspark.sql.functions import col, explode, array, lit
df_0 = feature_selection.filter(col("label") == 0)
df_1 = feature_selection.filter(col("label") == 1)

ratio = int(df_0.count()/df_1.count())
print(ratio)

2


In [17]:
a = range(ratio)
oversampled_df = df_1.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
combined_df = df_0.unionAll(oversampled_df)
df_0 = combined_df.filter(col("label") == 0)
df_1 = combined_df.filter(col("label") == 1)
print(df_0.count(),df_1.count())

85796 73384


In [15]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (Vectors.dense([2015.0,1.0,1.0,2.0,3.0,10.0,15.0,400.0,55.0,0.0,1027.0,-22.899999618530273,0.0,2.4000000953674316,1.0,7.0,0.0]), 0.0,),
    (Vectors.dense([2015.0,1.0,1.0,3.0,4.0,13.0,13.0,400.0,57.0,0.0,1028.0,-24.399999618530273,0.0,2.4000000953674316,1.0,7.0,0.0]), 0.0,),
    (Vectors.dense([2015.0,1.0,6.0,18.0,60.0,28.0,74.0,1100.0,20.0,0.0,1016.0,-18.700000762939453,0.0,0.8999999761581421,13.0,7.0,0.0]), 1.0,)], 
    [ "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=5, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

result = selector.fit(feature_selection).transform(feature_selection)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show(truncate = False)

IllegalArgumentException: 'Field "clicked" does not exist.'

In [None]:
df.show()

In [None]:
def functionname(value):
    return Vectors.dense(value)

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.linalg import VectorUDT
udfsomefunc = F.udf(functionname, VectorUDT())
featurestest = feature_selection.withColumn("testfeature", udfsomefunc("features"))
featurestest.show()

In [None]:
selector = ChiSqSelector(numTopFeatures=4, featuresCol="testfeature",
                         outputCol="selectedFeatures", labelCol="label")

result = selector.fit(featurestest).transform(featurestest)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show(truncate = False)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
dealWithFeatures =  VectorAssembler()


In [None]:
data_columns = new_df.columns
data_columns = [x for x in data_columns if x not in ['day','PM25','NQRIndex']]


In [None]:
data = dealWithFeatures.(Vectors.dense(setInputCols(data_columns))).setOutputCol("features").transform(new_df);

In [None]:
new_data_columns = new_df.columns
new_data_columns =  [x for x in data_columns if x not in ['features','NQRIndex']]
print(new_data_columns)
data = data.drop(*new_data_columns)

In [None]:
data.show(30, truncate = False)

In [None]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
selector = ChiSqSelector(numTopFeatures=16, featuresCol="features",outputCol="selectedFeatures", labelCol="NQRIndex")
result = selector.fit(data).transform(data)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show(30, truncate = False)

In [None]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "Select"])

selector = ChiSqSelector(numTopFeatures=2, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="Select")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show(10, truncate = False)

In [None]:
result