In [0]:
# Third-party imports used throughout the notebook.
import numpy as np
import pyspark.sql.functions as F
from matplotlib import pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from sklearn.linear_model import LinearRegression

# getOrCreate() returns the already-running session if there is one,
# otherwise it starts a new session with this application name.
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

In [0]:
import warnings 
warnings.filterwarnings("ignore")
import pandas as pd

In [0]:
# Confirm the CSV is present in the volume before attempting to read it.
file_path = 'dbfs:/Volumes/workspace/default/mydata_file/Loan_prediction_dataset.csv'
dbutils.fs.ls(file_path)

In [0]:
# Load the CSV: the first row supplies column names and Spark samples the
# data to infer each column's type.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('dbfs:/Volumes/workspace/default/mydata_file/Loan_prediction_dataset.csv')
)
display(df)

In [0]:
%sql

-- Rows of the LoanPrediction table whose Loan_Status is 'Y'.
-- NOTE(review): nothing in this notebook creates the table; presumably it was
-- saved once with df.write.saveAsTable('LoanPrediction') -- confirm it exists.
SELECT *
FROM LoanPrediction
WHERE Loan_Status = 'Y'

In [0]:
# Inspect each column's name and the data type Spark inferred for it.
df.dtypes

In [0]:
# Number of rows for each distinct Loan_Status value.
loan_status_counts = df.groupBy('Loan_Status').count()
loan_status_counts.show()

In [0]:
# Mean Credit_History for each Loan_Status value.
# The alias has to be applied to the aggregated *column*; calling .alias() on
# the resulting DataFrame only names the DataFrame and leaves the column
# called "avg(Credit_History)".
df_avg_credit = df.groupBy('Loan_Status').agg(
    F.avg('Credit_History').alias('avg_credit')
)
df_avg_credit.show(5)

In [0]:
# Row counts per Gender among rows with Loan_Status == 'Y'.
# Filtering before the aggregation means only the matching rows are grouped,
# and the equality test already discards null Loan_Status values, so no
# separate isNotNull() filter is needed. A Column expression is used instead
# of a SQL string so the literal does not depend on how double quotes are parsed.
(
    df.filter(F.col('Loan_Status') == 'Y')
    .groupBy('Loan_Status', 'Gender')
    .count()
    .show()
)

In [0]:
# Re-check column names and types before choosing the columns for the correlation matrix.
df.dtypes

In [0]:
# Pairwise Pearson correlation matrix for the listed columns, rounded to two
# decimals. Each df.stat.corr call runs on the Spark DataFrame; the results
# are collected into a small pandas frame for display.
Columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Build the whole table in one constructor call instead of concatenating one
# column at a time onto an initially empty DataFrame.
# Outer key -> DataFrame column, inner list -> rows (indexed by Columns).
Correlation_df = pd.DataFrame(
    {
        col_name: [round(df.stat.corr(col_name, row_name), 2) for row_name in Columns]
        for col_name in Columns
    },
    index=Columns,
)
Correlation_df



In [0]:
# Show the column names and types once more before splitting columns into numeric and categorical groups.
display(df.dtypes)

In [0]:
# Columns whose missing values are imputed with the column mean.
numerical_cols = [
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

# Columns whose missing values are imputed with the most frequent value.
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]


In [0]:
# Impute missing values in the numeric columns with each column's mean.
# All means are computed in a single aggregation (one Spark job) rather than
# one job per column.
column_means = df.select(
    [F.mean(name).alias(name) for name in numerical_cols]
).first()

# A column with no non-null values has a null mean; na.fill rejects None,
# so such columns are left untouched instead of failing the cell.
mean_fill_values = {
    name: column_means[name]
    for name in numerical_cols
    if column_means[name] is not None
}

# NOTE(review): if inferSchema typed a column as integer, the mean is cast to
# that type when filling (fractional part dropped) -- confirm that is acceptable.
if mean_fill_values:
    df = df.na.fill(mean_fill_values)

display(df)

In [0]:
# Impute missing values in the categorical columns with each column's mode
# (most frequent non-null value).
# The fill value is recorded inside the loop for every column; filling once
# after the loop with the last loop variable would impute only the final column.
mode_fill_values = {}
for cat_col in categorical_cols:
    most_frequent = (
        df.filter(F.col(cat_col).isNotNull())   # null must never be chosen as the mode
        .groupBy(cat_col)
        .count()
        # secondary sort key makes the choice deterministic when counts tie
        .orderBy(F.desc('count'), F.asc(cat_col))
        .first()
    )
    # first() returns None when the column has no non-null values at all
    if most_frequent is not None:
        mode_fill_values[cat_col] = most_frequent[0]

if mode_fill_values:
    df = df.na.fill(mode_fill_values)
display(df)
                       

In [0]:
# Add Total_income as the sum of the applicant's and co-applicant's income.
combined_income = df['ApplicantIncome'] + df['CoapplicantIncome']
df = df.withColumn('Total_income', combined_income)
df.show(5)

In [0]:
# Print the schema to check the column types after the Total_income column was added.
df.printSchema()

In [0]:
# Make sure Loan_Status is a string column before it is compared with 'Y'.
df = df.withColumn('Loan_Status', F.col('Loan_Status').cast(StringType()))

In [0]:
# Encode Loan_Status as a binary label: 'Y' -> 1, anything else (including null) -> 0.
# The test is made on the string form of the column and also accepts '1', so
# re-running this cell on an already-encoded column keeps the existing labels
# instead of silently turning every row into 0.
is_yes = F.col('Loan_Status').cast('string').isin('Y', '1')
df = df.withColumn('Loan_Status', F.when(is_yes, 1).otherwise(0))
display(df)


In [0]:
# Fit a one-feature linear regression of Loan_Status on Total_income.
# NOTE(review): Loan_Status is a 0/1 label here, so this is a linear fit to a
# binary target; a classifier may be more appropriate -- confirm intent.
pandas_df = df.toPandas()

# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
Test_cols = pandas_df['Total_income']
X = Test_cols.to_numpy().reshape(-1, 1)
y = pandas_df['Loan_Status'].to_numpy()

Lin_RModel = LinearRegression()
Lin_RModel.fit(X, y)

theta0 = Lin_RModel.intercept_
theta1 = Lin_RModel.coef_[0]
# Use significant-digit formatting: with a 0/1 target and income-scale inputs
# both parameters are small, and ".0f" would round them away entirely.
print(f"Hypothesis: y = {theta0:.4g} + {theta1:.4g} * x")

In [0]:
# Predict on the same feature matrix the model was fitted on.
y_pred = Lin_RModel.predict(X)

# Show only a short preview; the full array is still available in y_pred.
display(y_pred[:10])

In [0]:
# Plot the observed labels and the fitted regression line.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Actual data')

# Draw the line through x values in ascending order so it is a single clean
# segment rather than a path that doubles back over itself.
x_order = np.argsort(X[:, 0])
ax.plot(X[x_order, 0], y_pred[x_order], color='red', label='Hypothesis (prediction)')

# The feature on the x-axis is Total_income (X is built from that column).
ax.set_xlabel('Total_income')
ax.set_ylabel('Loan_Status')
ax.set_title('Linear Regression Hypothesis')
ax.legend()
ax.grid(True)
plt.show()