In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [0]:
import pyspark.sql.functions as F

In [0]:
# Load the raw CSV with a header row; np.nan is passed as both the
# nullValue and the nanValue reader option.
csv_reader = spark.read.option('header','true')
for option_name in ("nullValue", "nanValue"):
    csv_reader = csv_reader.option(option_name, np.nan)
df = csv_reader.csv('/FileStore/tables/Regression_final.csv')

In [0]:
# Columns that the column-pruning filters further down are written to exempt from removal.
columns_tokeep = ['Identifier','Component']

In [0]:
# Business rule: keep only rows whose Status is 'M', 'R' or 'V' and whose
# Severity is 1, 2 or 3.
status_col = F.col('Status')
severity_col = F.col('Severity')
status_ok = (status_col == "M") | (status_col == "R") | (status_col == "V")
severity_ok = (severity_col == 1) | (severity_col == 2) | (severity_col == 3)
df_filtered = df.filter(status_ok).filter(severity_ok)

In [0]:
# Cache the filtered frame and report (row count, column count).
n_rows = df_filtered.cache().count()
n_cols = len(df_filtered.columns)
print (n_rows, n_cols)

In [0]:
# Replace null values of the target column 'Regression' with 'N'
# (as confirmed by the business).
df_filtered = df_filtered.fillna(value='N', subset=['Regression'])

In [0]:
# Cache the filtered frame and keep its row count.
COUNT = df_filtered.cache().count()

In [0]:
# Drop every column in which more than 50% of the values are null, except the
# columns listed in columns_tokeep.

# Per-column null counts (not used further in this cell).
isNullStatCol = [F.sum(F.col(c).isNull().cast('int')).alias(c)  for c in df_filtered.columns]
# Per-column percentage of null values.
isNullMeanStatCol = [(100*F.mean(F.col(c).isNull().cast('int'))).alias(c)  for c in df_filtered.columns]
null_pct_by_col = df_filtered.select(*isNullMeanStatCol).head().asDict()
# The exemption must test the dict key `k`. The previous `c` was a variable
# leaked from the list comprehensions above (always the last column name on
# Python 2, undefined on Python 3), so columns_tokeep was never honoured.
# `v is not None` guards the empty-frame case where the mean is null.
na50cols = {k: v for k, v in null_pct_by_col.items()
            if v is not None and v > 50 and k not in columns_tokeep}
df_filtered3 = df_filtered.select([c for c in df_filtered.columns if c not in na50cols])

In [0]:
# Cache the null-pruned frame and report (row count, column count).
n_rows = df_filtered3.cache().count()
n_cols = len(df_filtered3.columns)
print (n_rows, n_cols)

In [0]:
# Drop columns that hold exactly one distinct value across the dataset,
# except the columns listed in columns_tokeep.

nuniqueCol = [F.countDistinct(F.col(name)).alias(name) for name in df_filtered3.columns]
distinct_counts = df_filtered3.select(*nuniqueCol).head().asDict()
nunique = dict(
    (k, v) for k, v in distinct_counts.items()
    if v == 1 and k not in columns_tokeep
)
non_constant_cols = [name for name in df_filtered3.columns if name not in nunique]
df_filtered4 = df_filtered3.select(non_constant_cols)

In [0]:
# Cache the frame without constant columns and report (row count, column count).
n_rows = df_filtered4.cache().count()
n_cols = len(df_filtered4.columns)
print (n_rows, n_cols)

In [0]:
# Drop high-cardinality columns (500 or more distinct values), except the
# columns listed in columns_tokeep. The distinct counts are taken on df_filtered3.
cardinality_by_col = df_filtered3.select(*nuniqueCol).head().asDict()
n500unique = dict(
    (k, v) for k, v in cardinality_by_col.items()
    if (v >= 500) and (k not in columns_tokeep)
)
low_cardinality_cols = [name for name in df_filtered4.columns if name not in n500unique]
df_filtered5 = df_filtered4.select(low_cardinality_cols)


In [0]:
# Cache the frame without high-cardinality columns and report (row count, column count).
n_rows = df_filtered5.cache().count()
n_cols = len(df_filtered5.columns)
print (n_rows, n_cols)

In [0]:
# Drop columns that duplicate the information of other columns.
duplicate_columns = [
    'DE-manager','DEPriorityDesc','DTPT-manager','Dev-escape-activity-display',
    'Dev-escape-resolver-opinion','DE-manager-org','DTPT-manager-org','Found-during',
    'Not-customer-visible-reason','Original-found-during','Severity-desc','org','Class',
]
df_filtered6 = df_filtered5.drop(*duplicate_columns)
# Single-argument print() parses on both Python 2 and Python 3; the previous
# `print a, b` statement is a SyntaxError on Python 3.
print("%s %s" % (df_filtered6.cache().count(), len(df_filtered6.columns)))

In [0]:
# Cast the numeric columns from their CSV types, then impute the two
# mean-time columns with their median.
from pyspark.ml.feature import Imputer

numeric_casts = [
    ("Mean-time-to-resolved", 'float'),
    ("Mean-time-to-assigned", 'float'),
    ("Tickets-count", 'int'),
    ("Age", 'int'),
    ("DE-priority", 'float'),
]
df_filtered7 = df_filtered6
for column_name, spark_type in numeric_casts:
    df_filtered7 = df_filtered7.withColumn(column_name, F.col(column_name).cast(spark_type))

# The imputer overwrites the input columns (outputCols == inputCols).
mean_time_columns = ["Mean-time-to-resolved","Mean-time-to-assigned"]
imp = Imputer(inputCols=mean_time_columns, outputCols=list(mean_time_columns))
imputer_model = imp.setStrategy("median").setMissingValue(np.nan).fit(df_filtered7)
df_filtered7 = imputer_model.transform(df_filtered7)


In [0]:
def changeLabels(col, labels=(1, 2, 3, 0), keys=None, mapping=None):
    """Build a Column expression that recodes the values of `col`.

    The source -> destination table is chosen in this order:
      1. `mapping`, when given and non-empty, is used as is;
      2. otherwise, when `keys` is given and non-empty, keys[i] -> labels[i];
      3. otherwise position i -> labels[i] (0 -> labels[0], 1 -> labels[1], ...).

    Parameters
    ----------
    col : str or Column
        Column to recode; a string is resolved with F.col.
    labels : sequence
        Destination values for cases 2 and 3.
    keys : sequence, optional
        Source values paired positionally with `labels`.
    mapping : dict, optional
        Explicit {source: destination} table.

    Returns
    -------
    A nested F.when(...).otherwise(...) expression. Values that match no source
    fall through to the innermost `otherwise(None)`. With an empty table the
    function returns None.
    """
    # Imported locally so the function also works on Python 3, where `reduce`
    # is no longer a builtin (functools.reduce exists on Python 2.6+ as well).
    from functools import reduce

    if mapping:
        matching = mapping
    elif keys:
        matching = dict(zip(keys, labels))
    else:
        matching = dict(enumerate(labels))

    # isinstance also accepts str subclasses, unlike the exact type() comparison.
    if isinstance(col, str):
        col = F.col(col)

    # Tuple-parameter lambdas (`lambda y, (src, dest): ...`) were removed in
    # Python 3, so the (src, dest) pair is indexed explicitly.
    return reduce(
        lambda fallback, pair: F.when(col == pair[0], pair[1]).otherwise(fallback),
        matching.items(),
        None,
    )

In [0]:
# Bin the continuous columns (mean time to resolve, mean time to assign,
# ticket count and age), which are reported to contain outliers, into ordinal
# buckets. Each step adds a new column; no existing column is dropped here.
# changeLabels then recodes bucket index i to labels[i]; any index without a
# label becomes null.
from pyspark.ml.feature import Bucketizer

# Mean-time-to-resolved -> 'new-MTTR'
bins = [-0.1,8,21,40,921]
mttr_bucketizer = Bucketizer(splits=bins,inputCol="Mean-time-to-resolved", outputCol='new-MTTR').setHandleInvalid("keep")
df_filtered8 = mttr_bucketizer.transform(df_filtered7)
df_filtered8 = df_filtered8.withColumn('new-MTTR',changeLabels('new-MTTR',labels=[1,2,3,0]))

# Mean-time-to-assigned -> 'new-MTTA'
bins = [-0.1,7,16,22,595]
mtta_bucketizer = Bucketizer(splits=bins,inputCol="Mean-time-to-assigned", outputCol='new-MTTA').setHandleInvalid("keep")
df_filtered9 = mtta_bucketizer.transform(df_filtered8)
df_filtered9 = df_filtered9.withColumn('new-MTTA',changeLabels('new-MTTA',labels=[1,2,3,0]))

# Tickets-count -> 'TC'
bins =  [-0.1,1,8,316]
tickets_bucketizer = Bucketizer(splits=bins,inputCol="Tickets-count", outputCol='TC').setHandleInvalid("keep")
df_filtered10 = tickets_bucketizer.transform(df_filtered9)
df_filtered10 = df_filtered10.withColumn('TC',changeLabels('TC',labels=[1,2,0]))

# Age -> 'new-age'
bins = [0,10,22,32,922]
age_bucketizer = Bucketizer(splits=bins,inputCol="Age", outputCol='new-age').setHandleInvalid("keep")
df_filtered11 = age_bucketizer.transform(df_filtered10)
df_filtered11 = df_filtered11.withColumn('new-age',changeLabels('new-age',labels=[1,2,3,0]))


In [0]:
# Map the levels of the 'Feature' column as decided during the EDA analysis.
## This step is commented out because, once the data is filtered to Status 'M','R','V'
## and Severity 1,2,3, the feature column has only one value, 'fix'.
## Hence the 'feature' column is removed during pre-processing/cleaning of the data.

#mapping = {'feature':0,'featurette':0,'fix':1,'port':0,'technology':0}
#df_filtered12=df_filtered11.withColumn('Feature',changeLabels('Feature',mapping=mapping) )
# The frame is passed through unchanged so the later cells keep their numbering.
df_filtered12 = df_filtered11

In [0]:
# Drop the columns rejected by the chi-square contingency test.
chi_square_rejected = [
    'Document','Documents-changed','By-previous-commit-value','Hardware','Data-classification',
    'Dev-escape-activity','ORG','Original-activity-when-found','Original-found',
    'Is-customer-visible','Test-EDP-activity','WORKAROUND','TPS_flag',
    'Project','Breakage','TC','Urgency','Submitter-org','TTA','PSIRT',
    'RAT_PSIRT',
]
df_filtered13 = df_filtered12.drop(*chi_square_rejected)


In [0]:
# Cache the frame and report "<row count> <column count>".
# Single-argument print() parses on both Python 2 and Python 3; the previous
# `print a, b` statement is a SyntaxError on Python 3.
print("%s %s" % (df_filtered13.cache().count(), len(df_filtered13.columns)))

In [0]:
# For every column that contains non-numeric values, fill nulls with the most
# frequent non-null value. A column counts as non-numeric when casting it to
# float produces more nulls (b) than the column already has (a).
# One line is printed per column: "<name> <a> <b> [<mode>] -".
for c in df_filtered13.columns:
    tmp = df_filtered13.select(
        F.sum(F.col(c).isNull().cast('int')).alias('a'),
        F.sum(F.col(c).cast('float').isNull().cast('int')).alias('b')
    ).head()
    report = "%s %s %s" % (c, tmp.a, tmp.b)
    if tmp.a < tmp.b:
        # Nulls are excluded before ranking: if null were the most frequent
        # value, the mode would be None and fillna(None) raises.
        mode = (df_filtered13
                .filter(~F.col(c).isNull())
                .groupBy(c)
                .count()
                .orderBy(F.desc('count'))
                .head()[c])
        report += " %s" % (mode,)
        df_filtered13 = df_filtered13.fillna(mode, subset=[c])
    # Single-argument print() parses on both Python 2 and Python 3.
    print(report + " -")


In [0]:
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StringIndexer

#catColumns = ['Bug-origin','Activity-when-found','Automated-test','BadCodeFlag','Behavior-changed','Impact', 
#              'Origin','RELNOTE','Released-code','RNE-approved', 'Status','Test-EDP-phase',  
#              'Regression','Found','org240','Priority','ORG120','Severity', 'Multi-commit-flag', 'Open-status', 'Reti-bug', 'VANALYSIS', 'Related-product']
#indexedColumns = []

#indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df_filtered13) for column in catColumns]


#pipeline = Pipeline(stages=indexers)
#df_r = pipeline.fit(df_filtered13).transform(df_filtered13)

#df_r.show()

In [0]:
# Convert the categorical columns to numeric indices with StringIndexer.
# Each column <name> gains a companion column <name>Index; columns that were
# indexed successfully are recorded in indexedColumns.

from pyspark.ml.feature import StringIndexer

catColumns = ['Bug-origin','Activity-when-found','Automated-test','BadCodeFlag','Behavior-changed','Impact','Is-shared-component',
              'Origin','RELNOTE','Released-code','RNE-approved','Sector','Status','Test-EDP-phase','Component','Product',
              'Regression','Found','org240','Priority','ORG120','Severity']
indexedColumns = []
for c in catColumns:
    try:
        indexer = StringIndexer(inputCol=c, outputCol=c+"Index")
        df_filtered13 = indexer.fit(df_filtered13).transform(df_filtered13)
        indexedColumns.append(c)
    except Exception as e:
        # Best-effort: report the column that could not be indexed and carry on.
        # Single-argument print() parses on both Python 2 and Python 3; the
        # previous `print c` statements are a SyntaxError on Python 3.
        print("%s\n%r" % (c, e))
print(indexedColumns)
        

In [0]:
#to remove the identifer from the dataset inorder to do the feature selection
df_filtered15=df_filtered13
# X = df_filtered5.drop(['Regression'], axis=1).values
# y = df_filtered5['Regression'].values

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Columns kept out of the assembled feature vector: the label in its raw and
# indexed form, and the 'Identifier' column.
to_remove = ["Regression", "RegressionIndex","Identifier"]

In [0]:
# Assemble the model inputs into a single 'features' vector.
# df_filtered15 already contains both every indexed column <name> and its
# companion <name>Index. Walking all columns and renaming <name> to
# <name>Index therefore used to list each index column twice. The companion
# columns are skipped here so each feature appears once.
# The suffix is applied only to columns recorded in indexedColumns (indexing
# succeeded), not to everything in catColumns, so a column whose indexing
# failed is not replaced by a non-existent <name>Index input.
index_output_cols = set(c + "Index" for c in indexedColumns)
feature_input_cols = [
    c + "Index" if c in indexedColumns else c
    for c in df_filtered15.columns
    if c not in to_remove and c not in index_output_cols
]
assembler = VectorAssembler(
    inputCols=feature_input_cols,
    outputCol="features")

In [0]:
# Apply the assembler: adds the 'features' vector column to the frame.
df_filtered16 = assembler.transform(df_filtered15)

In [0]:
# Debug check: display the entry at index 40 of the feature-name list derived
# from indexedColumns.
# NOTE(review): raises IndexError when that list has fewer than 41 entries.
[c+"Index" if c in indexedColumns else c for c in df_filtered15.columns if not c.startswith("RegressionIndex")][40]

In [0]:
# Detect the categorical features inside the assembled vector and index them.
# maxCategories=15: features with more than 15 distinct values are treated as continuous.
vector_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=15)
featureIndexer = vector_indexer.fit(df_filtered16)
df_filtered17 = featureIndexer.transform(df_filtered16)

In [0]:
# For each categorical feature index, the largest key of its category map.
# NOTE(review): if this is meant to be the number of categories (arity),
# len(category_map) is probably what is wanted - confirm before using it.
categoricalFeaturesInfo = {
    feature_idx: max(category_map)
    for feature_idx, category_map in featureIndexer.categoryMaps.items()
}

In [0]:

# Split the data into training and test sets (30% held out for testing).
# The split is taken from df_filtered16, which has 'features' but not yet
# 'indexedFeatures'. The pipeline applies featureIndexer itself, and that
# stage refuses to run on a frame that already contains its output column,
# which is what happened when df_filtered17 was split.
# A fixed seed makes the split reproducible across runs.
(trainingData, testData) = df_filtered16.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Configure (not yet fit) the random forest classifier: it reads the indexed
# feature vector and predicts the indexed 'Regression' label.
rf = RandomForestClassifier(
    featuresCol="indexedFeatures",
    labelCol='RegressionIndex',
    maxBins=980,
)


In [0]:

# Chain the feature indexer and the forest in a Pipeline.
# NOTE(review): featureIndexer adds the 'indexedFeatures' column, so the frame
# given to pipeline.fit() should not already contain it - confirm which frame
# is split into trainingData/testData.
pipeline = Pipeline(stages=[featureIndexer, rf])


In [0]:

# Fit the pipeline (feature indexer + random forest) on the training split.
model = pipeline.fit(trainingData)

# Score the held-out test split.
predictions = model.transform(testData)

# Preview a few scored rows.
preview = predictions.select("prediction", "RegressionIndex", "features")
preview.show(5)

# Compare predictions with the true label and report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="RegressionIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("accuracy = %g" % accuracy)



In [0]:
# Confusion-matrix style summary: row count per (prediction, true label) pair.
predictions.groupBy("prediction", "RegressionIndex").count().show()

In [0]:
# Display the predictions DataFrame as the cell output.
predictions

In [0]:
# Show the first 5 rows of the scored test split, all columns.
predictions.show(5)