In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [0]:
# Input file path and format.
# Only the path that is actually read is kept: the earlier assignments
# (test1.csv, tips.csv) were overwritten before use and only obscured which
# dataset the notebook works on.
file_location = "/FileStore/tables/hcvdata.csv"

file_type = "csv"

# Load the CSV at file_location: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_location)
)

In [0]:
# Preview the rows of the freshly loaded DataFrame.
df.show()

In [0]:
# Print the column names of the loaded DataFrame.
print(df.columns)

In [0]:
# Keep only the columns listed below, in this order, with the 'Category'
# label placed last. Any column not named here is dropped from df.
df = df.select(
    [
        'Age', 'Sex',
        'ALB', 'ALP', 'ALT', 'AST', 'BIL',
        'CHE', 'CHOL', 'CREA', 'GGT', 'PROT',
        'Category',
    ]
)

In [0]:
# Preview the first 5 rows after the column reordering.
df.show(5)

In [0]:
# Display the (column name, type) pairs of df.
df.dtypes

In [0]:
# Print the schema of df as a tree.
df.printSchema()


In [0]:
# Descriptive summary statistics for the columns of df.
# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so only appends a stray "None" line to the output.
df.describe().show()


In [0]:
# Number of rows per distinct 'Category' value (class distribution).
category_counts = df.groupBy('Category').count()
category_counts.show()

In [0]:
import pyspark.ml

In [0]:
# Interactive exploration: list the names exposed by the pyspark.ml package.
dir(pyspark.ml)


In [0]:
# Load ML Pkgs
from pyspark.ml.feature import VectorAssembler,StringIndexer


In [0]:

# Show every distinct value that occurs in the 'Sex' column.
(
    df.select('Sex')
    .distinct()
    .show()
)

In [0]:
# Label-encode the string column 'Sex'.
# The indexer is configured first and then fitted on df; the fitted model
# writes the numeric code to a new 'Gender' column when it is applied.
sex_indexer = StringIndexer(inputCol='Sex', outputCol='Gender')
genderEncoder = sex_indexer.fit(df)

In [0]:
# Apply the fitted encoder, adding the numeric 'Gender' column to df.
# NOTE(review): df is rebound to the transformed frame, so re-running this
# cell alone feeds the encoder a frame that already has 'Gender' -- confirm
# the notebook is always executed top to bottom.
df = genderEncoder.transform(df)

In [0]:
# Preview the first 5 rows, now including the encoded 'Gender' column.
df.show(5)

In [0]:
# Label-encode the 'Category' column into a numeric 'Target' column.
# The fitted indexer is kept in catEncoder so its label list can be inspected
# afterwards; df is rebound to the frame that carries the new column.
category_indexer = StringIndexer(inputCol='Category', outputCol='Target')
catEncoder = category_indexer.fit(df)
df = catEncoder.transform(df)

In [0]:
# Preview the first 5 rows, now including the encoded 'Target' column.
df.show(5)

In [0]:
# Display the labels learned by the fitted 'Category' indexer.
catEncoder.labels

In [0]:
# IndexToString
from pyspark.ml.feature import IndexToString

In [0]:
converter = IndexToString(inputCol='Target',outputCol='orig_cat')


In [0]:
# Apply the index-to-string converter; the result is kept separately from df.
converted_df = converter.transform(df)


In [0]:
# Preview the frame that carries the decoded 'orig_cat' column.
converted_df.show()


In [0]:
# Feature preparation: preview df (with the encoded 'Gender' and 'Target'
# columns) before the model inputs are selected.
df.show()

In [0]:
# Print the column names of df after both encodings were added.
print(df.columns)

In [0]:
# Display the (column name, type) pairs of df after the encodings.
df.dtypes

In [0]:
# Keep the encoded 'Gender' and 'Target' columns plus the measurement
# columns; the original string columns 'Sex' and 'Category' are left out.
df2 = df.select(
    [
        'Age', 'Gender',
        'ALB', 'ALP', 'ALT', 'AST', 'BIL',
        'CHE', 'CHOL', 'CREA', 'GGT', 'PROT',
        'Target',
    ]
)

In [0]:
# Print the schema of the selected columns.
df2.printSchema()


In [0]:
# Convert to pandas, substitute 0 for every 'NA' string, and cast all columns
# to float. From here on df2 is a pandas DataFrame, not a Spark one.
# NOTE(review): toPandas() pulls the full dataset onto the driver, and
# replacing 'NA' with 0 is an imputation choice -- confirm both are intended.
pandas_frame = df2.toPandas()
pandas_frame = pandas_frame.replace('NA', 0)
df2 = pandas_frame.astype(float)

In [0]:
# Confirm the type of df2 after the pandas conversion.
type(df2)

In [0]:
# For comparison: the type of df, which was not converted.
type(df)


In [0]:
# Convert the cleaned pandas frame (df2) back into a Spark DataFrame.
new_df = spark.createDataFrame(df2)


In [0]:
# Preview the rebuilt Spark DataFrame.
new_df.show()

In [0]:
# Print the schema of new_df to check the column types after the conversion.
new_df.printSchema()

In [0]:
# Input columns for the feature vector.
# 'Target' is deliberately NOT listed: it is the label the model is trained to
# predict, and putting it into the features leaks the answer into the inputs,
# which makes every evaluation metric meaninglessly optimistic.
required_features = [
    'Age', 'Gender',
    'ALB', 'ALP', 'ALT', 'AST', 'BIL',
    'CHE', 'CHOL', 'CREA', 'GGT', 'PROT',
]

In [0]:
# Assembler that packs the columns named in required_features into a single
# vector column called 'features'.
vec_assembler = VectorAssembler(
    outputCol='features',
    inputCols=required_features,
)

In [0]:
# Apply the assembler to new_df, adding the 'features' vector column.
vec_df = vec_assembler.transform(new_df)

In [0]:
# Preview the first 5 rows, including the assembled 'features' column.
vec_df.show(5)


In [0]:
# Split into roughly 70% training / 30% test rows (the Spark counterpart of
# scikit-learn's train_test_split).
# A fixed seed makes the split -- and therefore every metric computed below --
# reproducible across runs; without it each execution yields a different split.
train_df, test_df = vec_df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Number of rows that ended up in the training split.
train_df.count()

In [0]:
# Preview the first 4 rows of the training split.
train_df.show(4)

In [0]:
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier

In [0]:
# Logistic regression estimator: reads its inputs from the 'features' vector
# column and the label from 'Target'.
lr = LogisticRegression(
    labelCol='Target',
    featuresCol='features',
)


In [0]:
# Fit the logistic regression on the training split.
lr_model = lr.fit(train_df)

In [0]:
# Apply the fitted model to the held-out test split; y_pred is the resulting
# DataFrame with the model's output columns.
y_pred = lr_model.transform(test_df)

In [0]:
# Preview the rows of the prediction frame.
y_pred.show()

In [0]:
# Print the column names of the prediction frame.
print(y_pred.columns)

In [0]:
# Show the true label next to the model's raw scores, class probabilities and
# predicted class.
# The label column is referenced by its exact name 'Target', as everywhere
# else in the notebook; the lowercase spelling only resolves while Spark's
# column-name matching is case-insensitive.
y_pred.select('Target', 'rawPrediction', 'probability', 'prediction').show()

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Model evaluation

In [0]:
# Evaluator that reports overall accuracy, using 'Target' as the label column
# (the prediction column is left at the evaluator's default).
multi_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Target',
)

In [0]:
# Compute the accuracy of the test-set predictions.
multi_evaluator.evaluate(y_pred)

In [0]:
# Precision, recall and F1 score (classification report)


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics


In [0]:

# Build the RDD-based metrics object from the test-set predictions.
# MulticlassMetrics expects (prediction, label) pairs, in that order. Passing
# (label, prediction) transposes the confusion matrix, which swaps the
# per-label precision and recall (overall accuracy is unaffected).
# Rows are turned into plain tuples so the pairs are matched by position.
lr_metric = MulticlassMetrics(
    y_pred.select('prediction', 'Target').rdd.map(tuple)
)

In [0]:
# Interactive exploration: list the attributes available on the metrics object.
dir(lr_metric)

In [0]:
# Overall accuracy as reported by the RDD-based metrics object.
print("Accuracy",lr_metric.accuracy)


In [0]:
# Per-label precision, recall and F-measure for the label value 1.0.
label = 1.0
precision_value = lr_metric.precision(label)
recall_value = lr_metric.recall(label)
f_measure_value = lr_metric.fMeasure(label)

print("Precision", precision_value)
print("Recall", recall_value)
print("F1Score", f_measure_value)

In [0]:
# Interactive exploration: list the attributes available on the fitted model.
dir(lr_model)

In [0]:
# Persist the fitted logistic regression model.
# overwrite() keeps the notebook re-runnable: a plain write().save() raises
# when the target path already exists, which it does after the first run.
lr_model.write().overwrite().save("/FileStore/tables/mylr_model")