In [1]:
"""
Author - Yogesh Agrawal
Email -  Yogesh.agrawal@mindtree.com
"""

In [2]:
"""
Import the Spark packages used throughout this notebook.
"""
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.ml.pipeline import Transformer,Estimator
from pyspark.ml.feature import StringIndexer,VectorAssembler,QuantileDiscretizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from  pyspark.ml.param.shared import *

In [3]:
"""
Create (or reuse) the Spark session.
"""

# Build the session through the builder chain: reuse an existing session if
# there is one, otherwise create it under the application name 'Titanic'.
spark = (
    SparkSession.builder
    .appName('Titanic')
    .getOrCreate()
)

In [4]:
"""
Load a CSV file into a DataFrame.
@param - 
        path - path of the file
        header_value - header flag; in case it is True, the first row is used as the header
        
@return - DataFrame holding the loaded data.
"""

def load_data(path,header_value):
    """Read a CSV file into a Spark DataFrame with schema inference enabled.

    @param -
            path - location of the CSV file
            header_value - forwarded to the reader as ``header``

    @return - the DataFrame returned by ``spark.read.csv``.
    """
    reader = spark.read
    return reader.csv(path, header=header_value, inferSchema=True)

In [5]:
# Training and test sets; both files carry a header row.
df = load_data(path='/FileStore/tables/titanic_train.csv', header_value=True)
df_test = load_data(path='/FileStore/tables/titanic_test.csv', header_value=True)

In [6]:
# Preview the first five rows of the test set.
df_test.show(n=5)

In [7]:
"""
Count the null values in each column.
@param - 
          df - a dataframe 
"""
def check_column_null(df):
    """Show a one-row table with the number of null values in every column of ``df``.

    @param -
              df - a dataframe
    """
    null_counters = []
    for name in df.columns:
        # Only rows where the column is null produce a value for count() to tally.
        is_missing = col(name).isNull()
        null_counters.append(count(when(is_missing, name)).alias(name))
    df.select(null_counters).show()

In [8]:
# Null counts per column, first for the training set and then for the test set.
check_column_null(df=df)
check_column_null(df=df_test)

In [9]:
'''
Mixin class A for sharing a variable between the Estimator and Transformer classes.

@param - 
       Params - base class that provides the param storage; the stored value is the object to share ( here, primarily the fitted Estimator )

@return -
       Param - the shared object, available to the Transformer class through getCenterObject().

'''

class A(Params):
    """Params mixin exposing one param, ``center_param``, used to hand an object
    from the estimator to the transformer it creates.
    """

    center_param = Param(Params._dummy(), "center_param", "center_param")

    def __init__(self):
        super().__init__()

    def setCenterObject(self, value):
        """Store ``value`` under ``center_param`` and return this instance for chaining."""
        updated = self._set(center_param=value)
        return updated

    def getCenterObject(self):
        """Return the object currently held by ``center_param``."""
        shared_param = self.center_param
        return self.getOrDefault(shared_param)

In [10]:
'''
Custom Estimator class for the logic implementation.

@param - 
       Estimator - Estimator base class reference 
       df - dataframe on which the operations are carried out ( passed through the fit function )

@return -
       Model - a Transformer model that applies the transformation prepared by this estimator. 

'''

class My_preprocessing_Estimator(Estimator):
    """Estimator that learns the mean ``Age`` per passenger title (``Initial``).

    ``_fit`` extracts the title from ``Name``, folds rare titles into a few
    canonical ones, averages ``Age`` per title and returns a
    ``preprocess_transform`` that carries this estimator as its shared object.

    Attributes set by ``_fit``:
        df            - input DataFrame with the ``Initial`` column added
        Age_mean      - DataFrame with columns ``Initial`` and ``mean_age``
        Initials_list - titles, in the same order as ``Mean_list``
        Mean_list     - mean age for each title (may contain None)
    """

    def _fit(self,df):
        """Compute per-title age means from ``df`` and return the transformer.

        @param -
               df - training dataframe; must contain ``Name`` and ``Age``

        @return -
               a ``preprocess_transform`` holding a reference to this estimator.
        """
        print("********************************  in fit method ...************************************")

        rare_titles = ['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don']
        canonical_titles = ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr']

        # Raw string: "\." in a normal string literal is an invalid escape sequence.
        titled = df.withColumn("Initial",regexp_extract(col("Name"),r"([A-Za-z]+)\.",1))
        # Restrict the replacement to the derived column so no other string column is touched.
        titled = titled.replace(rare_titles, canonical_titles, subset=["Initial"])

        age_mean = titled.groupBy("Initial").avg('Age').withColumnRenamed('avg(Age)','mean_age')

        # Collect once: two independent collect() calls are not guaranteed to return
        # rows in the same order, which could pair a title with another title's mean.
        mean_rows = age_mean.collect()

        self.df = titled
        self.Age_mean = age_mean
        self.Initials_list = [row["Initial"] for row in mean_rows]
        self.Mean_list = [row["mean_age"] for row in mean_rows]
        return preprocess_transform().setCenterObject(self)

In [11]:
'''
Custom Transformer class implementing the transformation.

@param - 
       Transformer - Transformer base class reference 
       df - dataframe on which the operations are carried out ( passed through the transform function )
       A - class A, used for variable sharing.

@return -
       df - a dataframe with the generated, imputed and indexed feature columns. 

'''

class preprocess_transform(Transformer,A):
    """Transformer returned by ``My_preprocessing_Estimator._fit``.

    ``_transform`` derives the ``Initial`` and ``Alone`` features, fills missing
    ``Age``, ``Embarked`` and ``Fare`` values, string-indexes the categorical
    columns and drops the raw columns that are no longer needed.

    The fitted estimator is read back through ``getCenterObject()`` and supplies
    ``Initials_list`` / ``Mean_list`` for the age imputation.

    NOTE(review): the Embarked mode, the per-class fare mean and the StringIndexer
    label order are all computed from the DataFrame passed to ``transform`` rather
    than from the training data, so encodings can differ between the training and
    test sets. This behaviour is left unchanged here; confirm whether these
    statistics should be learned in the estimator instead.
    """

    def _transform(self,df):
        """Run the full preprocessing chain on ``df``.

        @param -
               df - dataframe with the raw Titanic columns

        @return -
               df - dataframe with ``Alone`` and ``*_index`` columns added, missing
                    values imputed and the raw helper columns dropped.
        """
        print("********************************  in Transform method ...************************************")
        # Keep `self` bound to the transformer; the estimator gets its own name.
        fitted = self.getCenterObject()

        string_columns = ["Sex","Embarked","Initial"]
        obsolete_columns = ['Cabin','Name','Ticket','Family_Size','SibSp','Parch','Sex','Embarked','Initial']

        dataset = self._feature_generation(df)
        dataset = self._age_impute(dataset, fitted.Initials_list, fitted.Mean_list)
        dataset = self._embark_impute(dataset)
        dataset = self._fare_impute(dataset)
        dataset = self._string_to_numeric(dataset, string_columns)
        # Dropping by name is a no-op for names that are not present.
        return dataset.drop(*obsolete_columns)

    def _feature_generation(self, df):
        """Add ``Initial`` (normalised title), ``Family_Size`` and ``Alone`` to ``df``."""
        rare_titles = ['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don']
        canonical_titles = ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr']

        # Raw string: "\." in a normal string literal is an invalid escape sequence.
        df = df.withColumn("Initial",regexp_extract(col("Name"),r"([A-Za-z]+)\.",1))
        # Only the derived title column should be rewritten.
        df = df.replace(rare_titles, canonical_titles, subset=["Initial"])
        df = df.withColumn("Family_Size",col('SibSp')+col('Parch'))
        # 1 when the passenger has no siblings/spouse/parents/children aboard, else 0.
        return df.withColumn("Alone",when(col("Family_Size") == 0, 1).otherwise(0))

    def _age_impute(self, df, initials, means):
        """Fill null ``Age`` with the mean age learned for the row's ``Initial``.

        ``initials`` and ``means`` are parallel lists; titles whose mean is None
        are skipped, and rows whose title is not in ``initials`` keep a null Age.
        """
        age_missing = col("Age").isNull()
        imputed_age = col("Age")
        for title, mean_age in zip(initials, means):
            if mean_age is None:
                continue
            imputed_age = when(age_missing & (col("Initial") == title), mean_age).otherwise(imputed_age)
        # A single projection instead of one withColumn per title.
        return df.withColumn("Age", imputed_age)

    def _embark_impute(self, df):
        """Fill null ``Embarked`` with the most frequent non-null value found in ``df``."""
        most_common = (
            df.filter(col('Embarked').isNotNull())   # the null group must never win
              .groupBy('Embarked')
              .count()
              .orderBy(col('count').desc(), col('Embarked').asc())   # deterministic on ties
              .first()
        )
        if most_common is None:
            # Empty input or an all-null column: nothing to impute with.
            return df
        return df.fillna({'Embarked': most_common['Embarked']})

    def _fare_impute(self, df):
        """Fill null ``Fare`` with the mean fare of the row's ``Pclass`` within ``df``."""
        fare_missing = col('Fare').isNull()
        affected = df.filter(fare_missing & col('Pclass').isNotNull()).select('Pclass').distinct()
        classes = [row['Pclass'] for row in affected.collect()]
        if not classes:
            return df

        # One aggregation for all affected classes instead of one per null row.
        class_means = df.filter(col('Pclass').isin(classes)).groupBy('Pclass').avg('Fare').collect()

        imputed_fare = col('Fare')
        for row in class_means:
            mean_fare = row['avg(Fare)']
            if mean_fare is None:
                continue
            imputed_fare = when(fare_missing & (col('Pclass') == row['Pclass']), mean_fare).otherwise(imputed_fare)
        return df.withColumn("Fare", imputed_fare)

    def _string_to_numeric(self, df, col_list):
        """Append a ``<name>_index`` column for every column name in ``col_list``."""
        indexers = [StringIndexer(inputCol=name,outputCol=name+"_index") for name in col_list]
        # Pipeline.fit fits each indexer itself, so no separate pre-fitting is needed.
        return Pipeline(stages=indexers).fit(df).transform(df)

In [12]:
# initialization for pipeline setup
my_model = My_preprocessing_Estimator()
feature = VectorAssembler(inputCols=['Pclass','Age','Fare','Alone','Sex_index','Embarked_index','Initial_index'],outputCol="features")
lr = LogisticRegression(labelCol='Survived',featuresCol='features')


'''
pipeline stages initilization , fit and transform.
'''
pipeline = Pipeline(stages=[my_model,feature,lr])
model = pipeline.fit(df)
prediction = model.transform(df_test)

In [13]:
# Preview the transformed test set (20 rows, the default for show()).
prediction.show(n=20)

In [14]:
"""
Generate feature column in dataframe based on specific logic

@param - 
         df - dataframe for operation.
         
@return - 
         df - dataframe with generated feature.
"""

def feature_generation(df):
    """Add the ``Initial``, ``Family_Size`` and ``Alone`` feature columns.

    @param -
             df - dataframe containing ``Name``, ``SibSp`` and ``Parch``

    @return -
             df - dataframe with the generated features.
    """
    rare_titles = ['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don']
    canonical_titles = ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr']

    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    df = df.withColumn("Initial",regexp_extract(col("Name"),r"([A-Za-z]+)\.",1))
    # Only the derived title column should be rewritten, not every string column.
    df = df.replace(rare_titles, canonical_titles, subset=["Initial"])

    df = df.withColumn("Family_Size",col('SibSp')+col('Parch'))
    # 1 when the passenger travels without family, else 0 (including a null Family_Size).
    return df.withColumn("Alone",when(col("Family_Size") == 0, 1).otherwise(0))

In [15]:
"""
Impute Age with the mean age of the passenger's title (the Initial column). Ex: if the mean for 'Mr' is 46, every 'Mr' row with a null Age is set to 46; similarly for the other titles.

@param - 
        df - dataframe for operation

@return -
       df - with imputed value

"""

def Age_impute(df):
    """Fill null ``Age`` values with the mean age of the row's ``Initial`` group.

    @param -
            df - dataframe containing ``Initial`` and ``Age``

    @return -
           df - dataframe with ``Age`` imputed; rows whose group has no known
                mean keep a null Age.
    """
    # Collect title and mean together so each mean stays paired with its own title;
    # two separate collect() calls are not guaranteed to return the same row order.
    mean_rows = df.groupBy("Initial").avg('Age').withColumnRenamed('avg(Age)','mean_age').collect()

    age_missing = col("Age").isNull()
    imputed_age = col("Age")
    for row in mean_rows:
        if row["mean_age"] is None:
            # Every Age in this group is null, so there is nothing to impute with.
            continue
        imputed_age = when(age_missing & (col("Initial") == row["Initial"]), row["mean_age"]).otherwise(imputed_age)

    # A single projection instead of one withColumn per title.
    return df.withColumn("Age", imputed_age)

In [16]:
"""
Impute Embark based on mode of embark column
@param - 
        df - dataframe for operation

@return -
       df - with imputed value

"""

def Embark_impute(df):
    """Fill null ``Embarked`` values with the most frequent non-null value.

    @param -
            df - dataframe containing ``Embarked``

    @return -
           df - dataframe with ``Embarked`` imputed; returned unchanged when the
                column holds no non-null value.
    """
    most_common = (
        df.filter(col('Embarked').isNotNull())   # the null group must never be chosen as the mode
          .groupBy('Embarked')
          .count()
          .orderBy(col('count').desc(), col('Embarked').asc())   # deterministic on ties
          .first()
    )
    if most_common is None:
        # Empty input or an all-null column: nothing to impute with.
        return df
    return df.fillna({'Embarked': most_common['Embarked']})

In [17]:
"""
Impute Fare with the mean fare of the passenger's class. Ex: if 3rd class has a mean fare of 9 and the null fare belongs to a 3rd class passenger, fill it with 9.
@param - 
        df - dataframe for operation

@return -
       df - with imputed value

"""

def Fare_impute(df):
    """Fill null ``Fare`` values with the mean fare of the row's ``Pclass``.

    @param -
            df - dataframe containing ``Fare`` and ``Pclass``

    @return -
           df - dataframe with ``Fare`` imputed; rows with a null ``Pclass`` or
                whose class has no known fare keep a null Fare.
    """
    fare_missing = col('Fare').isNull()

    # Distinct classes only: repeating the work for every null row adds nothing.
    affected = df.filter(fare_missing & col('Pclass').isNotNull()).select('Pclass').distinct()
    classes = [row['Pclass'] for row in affected.collect()]
    if not classes:
        return df

    # One aggregation over Fare for all affected classes.
    class_means = df.filter(col('Pclass').isin(classes)).groupBy('Pclass').avg('Fare').collect()

    imputed_fare = col('Fare')
    for row in class_means:
        mean_fare = row['avg(Fare)']
        if mean_fare is None:
            continue
        imputed_fare = when(fare_missing & (col('Pclass') == row['Pclass']), mean_fare).otherwise(imputed_fare)
    return df.withColumn("Fare", imputed_fare)
    

In [18]:
'''
combining all column imputation together..

@param - 
      df - a dataframe for operation.

@return - 
      df - dataframe with imputed value.
 
'''

def all_impute_together(df):
    """Apply the Age, Embarked and Fare imputations, in that order.

    @param -
          df - a dataframe for operation.

    @return -
          df - dataframe with imputed values.
    """
    with_age = Age_impute(df)
    with_embarked = Embark_impute(with_age)
    return Fare_impute(with_embarked)

In [19]:
"""
Drop column from dataframe
@param -
       df - dataframe 
       col_list - list of the names of the columns which need to be dropped.
@return -
       df - a dataframe except dropped column
"""

def drop_column(df,col_list):
    """Drop the named columns from a dataframe.

    @param -
           df - dataframe
           col_list - iterable of column names to drop
    @return -
           df - the dataframe without the listed columns (``df`` itself when
                ``col_list`` is empty)
    """
    names = list(col_list)
    if not names:
        return df
    # One drop call with plain names: a single plan node, and each name is taken
    # literally rather than being parsed as a column expression.
    return df.drop(*names)

In [20]:
'''
converting string to numeric values.

@param - 
         df - dataframe contained all columns.
         col_list - list of columns that need to be converted 
         
@return - 
        df - transformed dataframe.
'''
def stringToNumeric_conv(df,col_list):
    """Index each listed string column into ``<name>_index`` and drop the originals.

    @param -
             df - dataframe containing all columns.
             col_list - names of the string columns to convert

    @return -
            df - transformed dataframe.
    """
    fitted_indexers = []
    for name in col_list:
        string_indexer = StringIndexer(inputCol=name, outputCol=name+"_index")
        fitted_indexers.append(string_indexer.fit(df))

    indexing_pipeline = Pipeline(stages=fitted_indexers)
    indexed = indexing_pipeline.fit(df).transform(df)
    return drop_column(indexed, col_list)

In [21]:
# `lr_prediction` was never assigned in this notebook, so this cell raised
# NameError on a fresh kernel. Use the pipeline output for the test set when it
# carries the 'Survived' label; otherwise score the training set, in which case
# the metric reported below is a training-set accuracy.
# NOTE(review): confirm which dataset was meant to be evaluated.
lr_prediction = prediction if "Survived" in prediction.columns else model.transform(df)

evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

# The original select() result was discarded; show a few rows instead.
lr_prediction.select("features","rawPrediction","probability","prediction","Survived").show(5)

In [22]:
# Score the predictions with the accuracy evaluator, then report the accuracy
# and its complement.
lr_accuracy = evaluator.evaluate(dataset=lr_prediction)
print("Accuracy of LogisticRegression is = %g" % lr_accuracy)
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))