In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.metrics import accuracy_score

# Baseline experiment: impute, scale and one-hot encode the loan features,
# project them onto three principal components and fit a logistic regression.
# The model is scored on the very rows it was fitted on, so the number printed
# at the end is a training-set accuracy, not a held-out estimate.

# Feature groups routed through the two preprocessing branches.
# NOTE(review): only these columns are handed to the ColumnTransformer; any
# other column of X is presumably dropped by its default settings - confirm
# that this is intended.
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Read the raw table; Loan_Status is the target, Loan_ID is excluded from the features
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
y = df['Loan_Status']
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)

# Force the numerical columns to a numeric dtype; unparseable entries become NaN
X = X.assign(**{
    name: pd.to_numeric(X[name], errors='coerce') for name in numerical_columns
})

# Numerical branch: fill gaps with the column mean, then standardise
numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of every feature
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Apply each branch to its own group of columns
preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessor, numerical_columns),
    ('categorical', categorical_preprocessor, categorical_columns),
])

# Full model: preprocessing -> 3-component PCA -> logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=3)),
    ('model', LogisticRegression()),
])

# Fit on the entire dataset and predict those same rows back
y_pred = pipeline.fit(X, y).predict(X)

# Training-set accuracy
acc = accuracy_score(y, y_pred)
print(f'Train set accuracy: {acc}')


Train set accuracy: 0.6905537459283387


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Fit the baseline pipeline (preprocessing -> PCA -> logistic regression) on
# the full dataset and persist the fitted pipeline to 'loan_model.pkl'.

# Column groups consumed by the two preprocessing branches
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Load the data; split off the target and leave the identifier out of the features
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
y = df['Loan_Status']
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)

# Coerce the numerical columns to numbers; values that cannot be parsed turn into NaN
X = X.assign(**{
    name: pd.to_numeric(X[name], errors='coerce') for name in numerical_columns
})

# Mean imputation followed by standardisation for the numerical columns
numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Mode imputation followed by one-hot encoding (first level dropped) for the categoricals
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each column group through its branch
preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessor, numerical_columns),
    ('categorical', categorical_preprocessor, categorical_columns),
])

# End-to-end estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=3)),
    ('model', LogisticRegression()),
])

# Train on every row, then predict the training rows back
y_pred = pipeline.fit(X, y).predict(X)

# Training-set accuracy (computed but not displayed in this cell)
acc = accuracy_score(y, y_pred)

# Serialise the fitted pipeline so it can be loaded elsewhere
filename = 'loan_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the feature frame X
X.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Variant of the baseline: numerical features are min-max scaled so that the
# chi-square feature selector receives non-negative inputs; the five selected
# features are then reduced to three principal components before the logistic
# regression. The fitted pipeline is written to 'loan_model.pkl'.
# NOTE(review): `accuracy_score` and `pickle` are not imported in this cell and
# must already be in the kernel namespace from earlier cells.

# Column groups consumed by the two preprocessing branches
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Load the data; split off the target and leave the identifier out of the features
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
y = df['Loan_Status']
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)

# Coerce the numerical columns to numbers; values that cannot be parsed turn into NaN
X = X.assign(**{
    name: pd.to_numeric(X[name], errors='coerce') for name in numerical_columns
})

# Numerical branch: mean imputation, then MinMaxScaler to ensure non-negative values
numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding with the first level dropped
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each column group through its branch
preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessor, numerical_columns),
    ('categorical', categorical_preprocessor, categorical_columns),
])

# Keep the top 5 features according to the chi-square test
feature_selector = SelectKBest(chi2, k=5)

# Preprocessing -> feature selection -> PCA -> classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('pca', PCA(n_components=3)),
    ('model', LogisticRegression()),
])

# Train on every row, then predict the training rows back
y_pred = pipeline.fit(X, y).predict(X)

# Training-set accuracy (computed but not displayed in this cell)
acc = accuracy_score(y, y_pred)

# Serialise the fitted pipeline
filename = 'loan_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

# Hyperparameter search over the whole pipeline: 5-fold cross-validated grid
# search across the number of PCA components and the logistic-regression
# regularisation strength C. The best estimator is then scored on the rows it
# was fitted on, so the printed figure is a training-set accuracy.

# Column groups consumed by the two preprocessing branches
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Load the data; split off the target and leave the identifier out of the features
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
y = df['Loan_Status']
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)

# Coerce the numerical columns to numbers; values that cannot be parsed turn into NaN
X = X.assign(**{
    name: pd.to_numeric(X[name], errors='coerce') for name in numerical_columns
})

# Mean imputation followed by standardisation for the numerical columns
numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Mode imputation followed by one-hot encoding (first level dropped) for the categoricals
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each column group through its branch
preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessor, numerical_columns),
    ('categorical', categorical_preprocessor, categorical_columns),
])

# Estimator whose 'pca' and 'model' steps are tuned below
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=3)),
    ('model', LogisticRegression()),
])

# Candidate values, addressed as <step name>__<parameter>
param_grid = dict(
    pca__n_components=[2, 3, 4],
    model__C=[0.1, 1.0, 10.0],
)

# Exhaustive search with 5-fold cross-validation, run on the entire dataset
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Best parameter combination found by the search
best_model = grid_search.best_estimator_

# Score the winner on the training rows
y_pred = best_model.predict(X)
acc = accuracy_score(y, y_pred)
print(f'Train set accuracy: {acc}')


Train set accuracy: 0.6889250814332247


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.metrics import accuracy_score

# Tune a LogisticRegression (C and solver) on PCA-projected features and report
# its accuracy on a held-out test set.
#
# This cell previously referenced X_train_pca / X_test_pca / y_train / y_test
# without ever defining them, so it failed with NameError on a fresh kernel.
# The split and the projected feature matrices are now built here.

# Local import: train_test_split is not part of this cell's import list
from sklearn.model_selection import train_test_split

# Load your dataset and define the target variable
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
X = df.drop(columns=['Loan_Status', 'Loan_ID'])
y = df['Loan_Status']

# Separate numerical and categorical columns
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Convert numerical columns to numeric data type (unparseable values become NaN)
X[numerical_columns] = X[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Hold out 20% of the rows for evaluation.
# NOTE(review): the seed is arbitrary and only fixes the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the preprocessing steps for numerical and categorical data
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_preprocessor, numerical_columns),
        ('categorical', categorical_preprocessor, categorical_columns)
    ])

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=3)),
    ('model', LogisticRegression())
])

# Fit on the training rows only, so imputation, scaling, encoding and PCA
# never see the held-out rows
pipeline.fit(X_train, y_train)

# Predictions of the untuned pipeline on the test set
y_pred = pipeline.predict(X_test)

# Reuse the fitted transformer steps (everything except the final model) to
# obtain the PCA features for both splits
feature_steps = pipeline[:-1]
X_train_pca = feature_steps.transform(X_train)
X_test_pca = feature_steps.transform(X_test)

# Define the parameter grid for grid search
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize grid search with logistic regression model
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# Train the model using grid search on the training features
grid_search.fit(X_train_pca, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_pca)

# Calculate accuracy on the held-out rows
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')
print("Best parameters found by grid search:")
print(grid_search.best_params_)



Test set accuracy: 0.6910569105691057
Best parameters found by grid search:
{'C': 0.1, 'solver': 'liblinear'}


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pandas as pd

# Grid-search a random forest on top of min-max scaling, chi-square feature
# selection and PCA, using 3-fold cross-validation on the entire dataset.
#
# Loan_Status is label-encoded into class indices below, i.e. this is a
# classification task. The cell previously fitted a RandomForestRegressor, so
# the search was optimising a regression score on class labels; a classifier
# is used instead so the reported score is a classification score.

# Local import: RandomForestClassifier is not part of this cell's import list
from sklearn.ensemble import RandomForestClassifier

# Load your dataset and define the target variable
df = pd.read_csv(r'C:\Users\Tombra\deployment\deployment\data.csv')
X = df.drop(columns=['Loan_Status', 'Loan_ID'])
y = df['Loan_Status']

# Encode the target variable as integer class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Separate numerical and categorical columns
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Define the preprocessing steps for numerical and categorical data
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())  # Use MinMaxScaler to ensure non-negative values
])
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_preprocessor, numerical_columns),
        ('categorical', categorical_preprocessor, categorical_columns)
    ])

# Create the feature selector: chi-square selection followed by PCA
# (k and n_components are set by the grid search below)
feature_selector = Pipeline(steps=[
    ('selector', SelectKBest(score_func=chi2)),
    ('pca', PCA())
])

# Create the pipeline; the fixed seed makes the forest reproducible across runs
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('model', RandomForestClassifier(random_state=42))
])

# Define the parameter grid for grid search
param_grid = {
    'feature_selector__selector__k': [3, 5],  # Vary the number of selected features for SelectKBest
    'feature_selector__pca__n_components': [1, 2, 3],  # Vary the number of components for PCA based on available features
    'model__n_estimators': [100, 200, 300],  # Vary the number of estimators
    'model__max_depth': [None, 5, 10],  # Vary the maximum depth
}

# Create the grid search object within the pipeline
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3)

# Fit the grid search on the entire dataset
grid_search.fit(X, y)

# Get the best model and its mean cross-validated score
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

# Print the best model and its score
print("Best Model:", best_model)
print("Best Score:", best_score)

Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['ApplicantIncome',
                                                   'CoapplicantIncome',
                                                   'LoanAmount',
                                                   'Loan_Amount_Term']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                  