In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType,LongType

In [2]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [3]:
# Get the active Spark session, or create one named after this notebook.
# Every cell below reaches Spark through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("pysparkStrIndexer")
    .getOrCreate()
)

In [None]:
# Explicit schema with a rank, descriptive string columns, a year and five
# sales columns (presumably a video-game sales table). Every field is nullable.
# NOTE(review): `schema` is rebound by the next cell and the CSV load further
# down infers its schema, so nothing visible in this notebook uses this value.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ('Rank', IntegerType),
        ('Name', StringType),
        ('Platform', StringType),
        ('Year', IntegerType),
        ('Genre', StringType),
        ('Publisher', StringType),
        ('NA_Sales', DoubleType),
        ('EU_Sales', DoubleType),
        ('JP_Sales', DoubleType),
        ('Other_Sales', DoubleType),
        ('Global_Sales', DoubleType),
    )
])

In [None]:
# Explicit schema with four nullable columns: country/area name, year,
# measured value and category.
# NOTE(review): the CSV load below uses inferSchema=True and never receives
# this object; pass `schema=schema` to the reader if these types should be enforced.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("country_or_area", StringType),
        ("year", IntegerType),
        ("value", DoubleType),
        ("category", StringType),
    )
])

In [4]:
# Load the CSV: the first line is treated as the header and column types are
# inferred from the data.
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('greenhouse_gas_inventory_data_data.csv')
)

In [8]:
# Summary statistics for every column of `df`, converted to pandas for display.
# NOTE(review): the recorded output below already contains the `*_index`
# columns, so this cell was last executed after the StringIndexer cell further
# down; on a top-to-bottom run those columns do not exist yet at this point.
df.summary().toPandas()

Unnamed: 0,summary,country_or_area,year,value,category,country_or_area_index,category_index
0,count,8406,8406.0,8406.0,8406,8406.0,8406.0
1,mean,,2002.1884368308351,205547.19137087325,,19.544848917439925,3.523316678562931
2,stddev,,7.151604788310116,804150.3770358328,,12.356062857827885,2.3997553397751763
3,min,Australia,1990.0,1.10375e-06,carbon_dioxide_co2_emissions_without_land_use_...,0.0,0.0
4,25%,,1996.0,297.670751865432,,9.0,1.0
5,50%,,2002.0,7839.9346862074,,19.0,3.0
6,75%,,2014.0,7422207.96295139,,42.0,9.0
7,max,United States of America,2014.0,7422207.96295139,unspecified_mix_of_hydrofluorocarbons_hfcs_and...,42.0,9.0


In [None]:
# Summary statistics of the numeric measure column.
# The loaded data has no 'NA_Sales' column (its columns are country_or_area,
# year, value, category), so the statistics are taken over `value`.
summary = df.select('value').summary()
summary.show()

In [6]:
# Encode each categorical string column as a numeric `<column>_index` column.
index_source_columns = ["country_or_area", "category"]
index_output_columns = [column + "_index" for column in index_source_columns]

# Make the cell safe to re-run: StringIndexer refuses to write to an output
# column that already exists, so drop index columns left over from an earlier
# execution first. On the first run the columns are absent and drop() is a no-op.
df_unindexed = df.drop(*index_output_columns)

# One fitted indexer per source column, applied in order by the pipeline.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df_unindexed)
    for column in index_source_columns
]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df_unindexed).transform(df_unindexed)

In [9]:
# Summary statistics restricted to the two generated index columns.
(
    df
    .select("country_or_area_index", "category_index")
    .summary()
    .show()
)

+-------+---------------------+------------------+
|summary|country_or_area_index|    category_index|
+-------+---------------------+------------------+
|  count|                 8406|              8406|
|   mean|   19.544848917439925| 3.523316678562931|
| stddev|   12.356062857827883|2.3997553397751763|
|    min|                  0.0|               0.0|
|    25%|                  9.0|               1.0|
|    50%|                 19.0|               3.0|
|    75%|                 42.0|               9.0|
|    max|                 42.0|               9.0|
+-------+---------------------+------------------+



In [None]:
# Bring the summary rows to the driver and print the 75% statistic.
summ = summary.collect()
# Each row is (statistic label, value); find the row by its label instead of
# relying on its position in the summary output.
print(next(row[1] for row in summ if row[0] == '75%'))

In [None]:
def discretization(attr, frame=None):
    """Bucket one numeric column into quartile codes 0-3.

    Every value is compared with the column's 25%, 50% and 75% statistics as
    reported by ``DataFrame.summary``:

    * ``0`` -- value <= 25% cut
    * ``1`` -- 25% cut < value <= 50% cut
    * ``2`` -- 50% cut < value <= 75% cut
    * ``3`` -- value > 75% cut

    Args:
        attr: Name of the column to discretize.
        frame: DataFrame to read the column from. When omitted, the
            notebook-level ``df`` is used (the original behaviour).

    Returns:
        list: One code per row, in ``collect()`` order. Null cells yield
        ``None`` instead of raising; an empty column yields ``[]``.

    Raises:
        TypeError: If the column has non-null values but no quartile
            statistics (``float(None)``).
    """
    source = df if frame is None else frame
    column = source.select(attr)
    values = [row[0] for row in column.collect()]
    if not values:
        return []
    if all(value is None for value in values):
        # No data to compare against; keep the cells missing.
        return [None] * len(values)

    # Request only the three quartiles and look them up by label rather than
    # by row position. Summary values arrive as strings, so parse them once
    # here instead of on every loop iteration.
    stats = {row[0]: row[1] for row in column.summary("25%", "50%", "75%").collect()}
    q1, q2, q3 = (float(stats[label]) for label in ("25%", "50%", "75%"))

    def code(value):
        if value is None:
            return None
        if value <= q1:
            return 0
        if value <= q2:
            return 1
        if value <= q3:
            return 2
        return 3

    return [code(value) for value in values]

In [None]:
def _quartile_code(value, cuts):
    """Return the bucket index of ``value`` among ascending ``cuts``; ``None`` stays ``None``."""
    if value is None:
        return None
    for code, cut in enumerate(cuts):
        if value <= cut:
            return code
    return len(cuts)


def multiDiscretization(attrs, df):
    """Discretize several numeric columns of ``df`` into quartile codes.

    For every column named in ``attrs`` each value is replaced by 0, 1, 2 or 3
    according to which of the column's 25% / 50% / 75% statistics (from
    ``DataFrame.summary``) it does not exceed; values above the 75% cut get 3.

    The columns are read from the ``df`` argument. The rows and the quartiles
    are each collected once for all columns together.

    Args:
        attrs: Names of the columns to discretize.
        df: DataFrame holding those columns.

    Returns:
        A new DataFrame, built with the notebook-level ``spark`` session, with
        one ``<name>_disc`` column per entry of ``attrs`` and one row per row
        of ``df`` in ``collect()`` order. Null cells stay null.

    Raises:
        ValueError: If ``attrs`` is empty.
    """
    attrs = list(attrs)
    if not attrs:
        raise ValueError("attrs must name at least one column")
    labels = [name + '_disc' for name in attrs]

    selected = df.select(*attrs)
    rows = selected.collect()

    # Summary rows look like (label, stat for attrs[0], stat for attrs[1], ...);
    # the statistics arrive as strings and are parsed once per column.
    stats = {row[0]: row[1:] for row in selected.summary("25%", "50%", "75%").collect()}
    cuts_per_column = []
    for position in range(len(attrs)):
        raw_cuts = [stats[label][position] for label in ("25%", "50%", "75%")]
        if any(cut is None for cut in raw_cuts):
            # Column without quartiles (e.g. entirely null).
            cuts_per_column.append(None)
        else:
            cuts_per_column.append([float(cut) for cut in raw_cuts])

    # Rows are already row-oriented, so no transpose is needed.
    out = [
        [_quartile_code(row[position], cuts_per_column[position]) for position in range(len(attrs))]
        for row in rows
    ]
    return spark.createDataFrame(out, labels)

In [None]:
# Discretize the numeric columns of the loaded data into quartile codes.
# The loaded CSV has no `*_Sales` columns; its numeric columns are `year` and `value`.
df2 = multiDiscretization(['year', 'value'], df)

In [None]:
# Names of the generated `<column>_disc` columns.
df2.columns

In [None]:
# Print a preview of the discretized rows.
df2.show()

In [None]:
def _with_row_id(frame, id_col="_row_id"):
    """Return ``frame`` with a 0-based positional ``id_col`` appended, keeping its schema."""
    indexed_rows = frame.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],))
    indexed_schema = StructType(frame.schema.fields + [StructField(id_col, LongType(), False)])
    return spark.createDataFrame(indexed_rows, indexed_schema)


# `df2` has no key column, so a join without a condition would pair every row
# of `df` with every row of `df2`. Attach a positional id to both sides and
# join on it so each row receives its own discretized values.
# NOTE(review): this assumes `df2` was built from `df` in collect() order and
# that `df` is re-evaluated in the same row order — confirm for your source.
df_with_disc = (
    _with_row_id(df)
    .join(_with_row_id(df2), on="_row_id")
    .orderBy("_row_id")
    .drop("_row_id")
)
df_with_disc