In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data from a CSV file located in the working directory.
data1=pd.read_csv("instrain.csv")

In [None]:
# Show the loaded frame as the cell's rich output.
data1

In [None]:
# Preview the first five rows of the data.
data1.head()

In [None]:
# Summary statistics of the columns (count, mean, std, min, quartiles, max).
data1.describe()

In [None]:
# Missing values are encoded as -1 in this dataset; turn them into NaN so
# that pandas' missing-data tools (isna, fillna, ...) recognise them.
data1 = data1.replace(-1, np.nan)

In [None]:
# Creating Metadata DataFrame so that it will be easy to handle and manipulate data during data exploration steps

# role: input, ID, target
# level: nominal, interval, ordinal, binary
# keep: True or False
# dtype: int, float, str

def create_metadata_dataframe(data1):
    """Build a one-row-per-column metadata table for ``data1``.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Frame whose columns are described.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable``, ``role``, ``level``, ``keep``, ``dtype`` and
        ``unique_values``:

        * role  -- ``'id'`` / ``'target'`` for the columns with those names,
          ``'input'`` otherwise.
        * level -- ``'binary'`` for ``*bin`` columns and ``target``,
          ``'categorical'`` for ``*cat`` columns and ``id``, ``'interval'``
          for float columns, ``'ordinal'`` for integer columns and
          ``'nominal'`` for anything else (e.g. strings).
        * keep  -- always ``True``; flip it later to exclude a variable.
        * dtype -- the pandas dtype of the column.
        * unique_values -- number of distinct non-null values.
    """
    columns = ['variable', 'role', 'level', 'keep', 'dtype', 'unique_values']
    records = []

    for column in data1.columns:
        series = data1[column]
        name = str(column)

        # Role of the variable in the modelling task.
        if column == 'id':
            role = 'id'
        elif column == 'target':
            role = 'target'
        else:
            role = 'input'

        # Measurement level. The name-based rules come first; the dtype rules
        # accept any float/integer width, and the final branch guarantees that
        # `level` is always set for the current column.
        if name.endswith('bin') or column == 'target':
            level = 'binary'
        elif name.endswith('cat') or column == 'id':
            level = 'categorical'
        elif pd.api.types.is_float_dtype(series.dtype):
            level = 'interval'
        elif pd.api.types.is_integer_dtype(series.dtype):
            level = 'ordinal'
        else:
            level = 'nominal'

        records.append({
            'variable': column,
            'role': role,
            'level': level,
            'keep': True,
            'dtype': series.dtype,
            'unique_values': series.nunique(),
        })

    # Build the frame once from the collected records; DataFrame.append no
    # longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=columns)


# Read the dataset again from disk.
# NOTE(review): this is the raw file, so the -1 sentinels are still present
# here (unlike in `data1`, where they were replaced by NaN) -- confirm that
# describing the raw file is what is intended.
your_dataset_path = 'instrain.csv'
your_dataframe = pd.read_csv(your_dataset_path)

# Build the per-column metadata table.
metadata_dataframe = create_metadata_dataframe(your_dataframe)

# Print the metadata table as plain text.
print(metadata_dataframe)

In [None]:
# Split the columns by dtype: object ('O') columns are treated as
# categorical, everything else as numeric.
numeric_features = [feature for feature in data1.columns if data1[feature].dtype != 'O']
categorical_features = [feature for feature in data1.columns if data1[feature].dtype == 'O']

# Report both groups in the same "count : names" form. The second message
# needs two placeholders, otherwise the names are silently dropped.
print('we have {} numeric features : {}'.format(len(numeric_features), numeric_features))
print('\nwe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

In [None]:
# Count the missing (NaN) values in every column.
data1.isna().sum()

The output above shows that several features have a large number of missing values:

| Feature | Missing values |
| --- | --- |
| ps_car_03_cat | 411231 |
| ps_reg_03 | 107772 |
| ps_car_05_cat | 266551 |
| ps_car_14 | 42620 |
| ps_car_07_cat | 11489 |

We drop the features with the most missing values. The remaining missing values are imputed with the mode or the mean: binary features (names ending in `bin`) and categorical features (names ending in `cat`) are filled with the mode, and the remaining continuous and ordinal features are filled with the mean.

In [None]:
# Drop the features with the most missing values. Reassigning instead of
# dropping in place, together with errors="ignore", keeps the cell safe to
# re-run: a second run no longer fails because the columns are already gone.
data1 = data1.drop(
    columns=["ps_car_03_cat", "ps_reg_03", "ps_car_05_cat", "ps_car_14", "ps_car_07_cat"],
    errors="ignore",
)

In [None]:
# Impute the missing values that remain after dropping the sparse features.
feature_bin = [f for f in data1.columns if f.endswith('bin')]
feature_cat = [f for f in data1.columns if f.endswith('cat')]
feature_els = [
    f for f in data1.columns
    if f not in feature_bin and f not in feature_cat and f not in ('id', 'target')
]

# Assign the filled column back to the frame. Calling
# `data1[f].fillna(..., inplace=True)` works on an intermediate object and is
# not guaranteed to update `data1` (it does not under pandas Copy-on-Write).
for f in feature_bin + feature_cat:
    # Binary / categorical features: most frequent value.
    data1[f] = data1[f].fillna(data1[f].mode()[0])
for f in feature_els:
    # Remaining numeric features: column mean.
    data1[f] = data1[f].fillna(data1[f].mean())

In [None]:
# HEATMAP


In [None]:
# Pairwise Pearson correlation of the binary (`*bin`) features, drawn as an
# annotated heatmap.
bin_corr = data1[feature_bin].corr()

plt.figure(figsize=(20, 20))
plt.title('Pearson correlation of bin features', y=1.05, size=15)
sns.heatmap(bin_corr, annot=True, square=True, vmax=1.0,
            linewidths=0.1, linecolor='white')

In [None]:
# Pairwise Pearson correlation of the categorical (`*cat`) features, drawn as
# an annotated heatmap.
cat_corr = data1[feature_cat].corr()

plt.figure(figsize=(10, 10))
plt.title('Pearson correlation of cat features', y=1.05, size=15)
sns.heatmap(cat_corr, annot=True, square=True, vmax=1.0,
            linewidths=0.1, linecolor='white')

correlations between all numeric variables in your dataset. Values range from -1 to 1, where:

1 indicates a perfect positive correlation,
-1 indicates a perfect negative correlation, and
0 indicates no correlation.

"The Pearson correlation coefficient between column1 and column2 is 0.75, indicating a strong positive linear relationship."
highly correlated features can sometimes lead to overfitting.

Converting categorical data into numerical data using label encoding. Encoding is how we represent categorical variables as numerical values in a machine learning model. LabelEncoder assigns a numerical label to each category, while OneHotEncoder creates a binary vector representation of the categorical data, where each column represents a unique category. The cell below uses LabelEncoder, so the number of columns does not change.

In [None]:
# Label encoding for categorical variables

# Record the column count before encoding so that the two messages below
# really compare "before" with "after".
n_variables_before = data1.shape[1]

# Select the categorical columns by suffix, the same rule used for imputation;
# a substring match would also pick up any column that merely contains "cat".
features_cat = [c for c in data1.columns if c.endswith('cat')]
for f in features_cat:
    le = LabelEncoder()
    data1[f] = le.fit_transform(data1[f])

# Label encoding replaces values in place, so the count is expected to be
# unchanged (unlike one-hot encoding, which adds columns).
print('Before encoding we have {} variables in data1'.format(n_variables_before))
print('After encoding we have {} variables in data1'.format(data1.shape[1]))

In [None]:
#Removing low variance variables from the dataframe

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Feature matrix only: the identifier and the label are not candidates.
feature_frame = data1.drop(['id', 'target'], axis=1)

# Fit a variance selector with a 0.01 cut-off on the features.
selector = VarianceThreshold(threshold=.01)
selector.fit(feature_frame)

# Columns whose fitted variance lies below the cut-off.
low_variance_cols = feature_frame.columns[selector.variances_ < .01]

# Report how many and which features were flagged.
print(len(low_variance_cols), 'variables have too low variance.')
print('These variables are', list(low_variance_cols))

In [None]:
#Dropping the low variance variables from the data and checking the shape of the final dataframe

# Remove the low-variance features and show the shape of the result.
data2 = data1.drop(columns=list(low_variance_cols))
data2.shape

In [None]:
# MODEL CREATION
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; `id` and `target` are not features.
model_features = data2.drop(['target', 'id'], axis=1)
model_target = data2['target'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    model_features, model_target, test_size=0.30, random_state=101
)

In [None]:
# Training frame with the label attached, used for resampling below.
# `assign` returns a new frame, so `X_train` itself keeps features only;
# a plain `df_train_temp = X_train` would alias it and add `target` to
# `X_train` as well.
df_train_temp = X_train.assign(target=y_train)

In [None]:
# Rows of the positive class (target == 1) in the training frame, and how many there are.
df_minority = df_train_temp.loc[df_train_temp['target'] == 1]
df_minority.shape

In [None]:
from sklearn.utils import resample

# Split the training frame into majority (0) and minority (1) classes.
df_majority = df_train_temp[df_train_temp.target == 0]
df_minority = df_train_temp[df_train_temp.target == 1]

# Size of the minority class.
minority_class_size = df_minority.shape[0]

# Downsample the majority class (without replacement) to the minority size.
df_majority_downsampled = resample(df_majority, replace=False, n_samples=minority_class_size, random_state=123)

# Balanced frame: downsampled majority + all minority rows.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Class distribution after downsampling. Printed explicitly because a bare
# expression in the middle of a cell is not displayed.
print(df_downsampled.target.value_counts())

# Classes of the balanced frame.
df_majority = df_downsampled[df_downsampled.target == 0]
df_minority = df_downsampled[df_downsampled.target == 1]

# Size of the (downsampled) majority class.
majority_class_size = df_majority.shape[0]

# Upsample the minority class only if it is still smaller than the majority.
# When the classes already match, sampling with replacement at the same size
# would only swap unique positive rows for duplicates.
if df_minority.shape[0] < majority_class_size:
    df_minority_upsampled = resample(df_minority, replace=True, n_samples=majority_class_size, random_state=123)
else:
    df_minority_upsampled = df_minority

# Final balanced training frame.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Class distribution of the final training frame.
df_upsampled.target.value_counts()


In [None]:
## After resampling, the training data contains the same number of rows for both classes, i.e. a much higher share of positive examples (target=1) than the raw data, which may help the model learn to predict positive examples.

In [None]:
# Separate the resampled training frame into features and label.
y_train_up = df_upsampled['target']
X_train_up = df_upsampled.drop(columns='target')

# FEATURE SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the training features: learn each column's mean and standard
# deviation on the resampled training set, then apply them to it.
# The target is a class label and is not scaled.
mms = StandardScaler()
mms.fit(X_train_up)
X_train_up_scaled = mms.transform(X_train_up)

## MACHINE LEARNING MODELS

### logistic_Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit logistic regression on the standardized, resampled training data.
# The scaled matrix was computed earlier but the model was being fit on the
# raw features; logistic regression is sensitive to feature scale.
lm = LogisticRegression()
logistic_model = lm.fit(X_train_up_scaled, y_train_up)

# The test features must go through the scaler fitted on the training data.
X_test_scaled = mms.transform(X_test)

# Evaluate model on test data
predictions_LM = logistic_model.predict(X_test_scaled)

print('Classification Report - Logistic Regression')
print(classification_report(y_test, predictions_LM))

print('Confusion Matrix - Logistic Regression')
print(confusion_matrix(y_test, predictions_LM))

Precision: It is the ratio of true positive predictions to the total number of predicted positive instances. For the "0" class, the precision is 0.97 which means that 97% of the instances predicted as "0" are actually "0". For the "1" class, the precision is 0.05 which means that only 5% of the instances predicted as "1" are actually "1". Recall: It is the ratio of true positive predictions to the total number of actual positive instances. For the "0" class, the recall is 0.62 which means that 62% of the actual "0" instances are correctly identified by the model. For the "1" class, the recall is 0.55 which means that 55% of the actual "1" instances are correctly identified by the model. F1-score: It is the harmonic mean of precision and recall. It provides a balance between precision and recall. For the "0" class, the F1-score is 0.76, the harmonic mean of its precision (0.97) and recall (0.62). For the "1" class, the F1-score is 0.09 which indicates poor performance. Accuracy: It is the ratio of the total number of correct predictions to the total number of predictions. The overall accuracy of the model is 0.62 which means that the model correctly predicted 62% of the instances in the test set. Support: It is the number of instances in each class. The confusion matrix shows the actual and predicted class labels. The rows represent the actual classes, and the columns represent the predicted classes. The elements of the confusion matrix are as follows:

True Negative (TN): The number of instances that are actually negative and predicted as negative. In this case, there are 106,297 TN instances of the "0" class. False Positive (FP): The number of instances that are actually negative but predicted as positive. In this case, there are 65,754 FP instances of the "1" class. False Negative (FN): The number of instances that are actually positive but predicted as negative. In this case, there are 2,913 FN instances of the "0" class. True Positive (TP): The number of instances that are actually positive and predicted as positive. In this case, there are 3,600 TP instances of the "1" class. The confusion matrix shows that the model is correctly predicting most of the instances of the "0" class but poorly predicting instances of the "1" class. This could be due to the class imbalance issue in the dataset. The model may need to be trained on a more balanced dataset or use techniques like resampling, regularization, or different classification algorithms to improve performance on the minority class

### KNN

In [None]:
# Import KNeighborsClassifier from sklearn.neighbors
# Import KNeighborsClassifier from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier object
# NOTE(review): an even number of neighbours allows 1-1 ties in a binary
# problem -- consider an odd value.
clf_KNN = KNeighborsClassifier(n_neighbors=2)

# Fit on the standardized training data: k-NN is distance based, so features
# with large ranges would otherwise dominate the neighbour search.
clf_KNN.fit(X_train_up_scaled, y_train_up)

# Scale the test features with the scaler fitted on the training data, then predict.
X_test_scaled = mms.transform(X_test)
predictions_KNN = clf_KNN.predict(X_test_scaled)

# Print the classification report for the predictions
print('Classification Report - Nearest Neighbors')
print(classification_report(y_test, predictions_KNN))

# Print the confusion matrix for the predictions
print('Confusion Matrix - Nearest Neighbors')
print(confusion_matrix(y_test, predictions_KNN))

For the Nearest Neighbors model, the precision for class 0 is 0.96, which means that 96% of the samples predicted as class 0 were actually class 0. The recall for class 0 is 0.77, which means that 77% of the actual class 0 samples were correctly predicted as class 0. Similarly, the precision and recall for class 1 are 0.04 and 0.25, respectively, which indicates poor performance for this class.

The confusion matrix shows the actual and predicted class labels for the test set. The rows correspond to the actual class labels, and the columns correspond to the predicted class labels. In this case, the model correctly predicted 132149 samples as class 0, but incorrectly predicted 39902 samples as class 1. Similarly, the model correctly predicted 1632 samples as class 1, but incorrectly predicted 4881 samples as class 0.

### Randomized logistic regression

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Tune the regularization strength C of a logistic regression with a
# cross-validated search scored by ROC AUC.

logreg = LogisticRegression(class_weight='balanced')

# Define the hyperparameter search space
param = {'C': [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1]}

# Initialize RandomizedSearchCV; random_state makes the sampled candidates
# reproducible across runs.
clf_RLR = RandomizedSearchCV(logreg, param_distributions=param, scoring='roc_auc',
                             refit=True, cv=3, random_state=42)

# Search on the standardized training data (the scaled matrix was computed
# earlier but not used; regularized logistic regression is scale sensitive).
clf_RLR.fit(X_train_up_scaled, y_train_up)

# Scale the test features with the scaler fitted on the training data, then predict.
X_test_scaled = mms.transform(X_test)
predictions_RLR = clf_RLR.predict(X_test_scaled)

# Print confusion matrix and classification report
print('Confusion Matrix RandomizedSearchCV:')
print(confusion_matrix(y_test, predictions_RLR))

print('\nClassification Report RandomizedSearchCV:')
print(classification_report(y_test, predictions_RLR))

For the RandomizedSearchCV model, the precision for class 0 is 0.97, which means that 97% of the samples predicted as class 0 were actually class 0. The recall for class 0 is 0.62, which means that 62% of the actual class 0 samples were correctly predicted as class 0. Similarly, the precision and recall for class 1 are 0.05 and 0.55, respectively, which indicates poor performance for this class.

The confusion matrix shows the actual and predicted class labels for the test set. The rows correspond to the actual class labels, and the columns correspond to the predicted class labels. In this case, the model correctly predicted 106124 samples as class 0, but incorrectly predicted 65927 samples as class 1. Similarly, the model correctly predicted 3602 samples as class 1, but incorrectly predicted 2911 samples as class 0.

Overall, the model seems to have performed poorly, with a low accuracy and F1-score. The recall for the positive class (class 1) is relatively high compared to the precision, which indicates that the model is better at identifying positive samples but is less precise in doing so. This could indicate a class imbalance in the dataset or a need for more fine-tuning of the model hyperparameters.

### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Train an SVM classifier with a radial basis function (RBF) kernel on the
# standardized training data: the RBF kernel is distance based, so unscaled
# features with large ranges would dominate it.
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train_up_scaled, y_train_up)

# Scale the test features with the scaler fitted on the training data, then predict.
X_test_scaled = mms.transform(X_test)
y_pred = svm.predict(X_test_scaled)

# Evaluate the performance of the classifier
print("Classification Report - SVM")
print(classification_report(y_test, y_pred))

print("Confusion Matrix - SVM")
print(confusion_matrix(y_test, y_pred))


The precision for class 0 is 0.97, which means that out of all the predicted negatives (0s), 97% of them were true negatives. The recall for class 0 is 0.66, which means that out of all the actual negatives (0s), 66% of them were correctly identified by the model. The F1-score for class 0 is 0.79, which is the harmonic mean of precision and recall.

The precision for class 1 is very low at 0.05, which means that out of all the predicted positives (1s), only 5% of them were true positives. The recall for class 1 is 0.49, which means that out of all the actual positives (1s), only 49% of them were correctly identified by the model. The F1-score for class 1 is 0.09, which is quite low.

The confusion matrix shows that the model correctly predicted 113,748 true negatives (0s) and 3,210 true positives (1s), but also produced 58,303 false positives (actual 0s predicted as 1) and 3,303 false negatives (actual 1s predicted as 0).

Overall, the model's performance is better than the logistic regression and nearest neighbors models, but still not ideal, especially for predicting the positive class.

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Gradient boosting with default hyperparameters (fixed seed), trained on the
# resampled training data.
gb_cllf = GradientBoostingClassifier(random_state=42).fit(X_train_up, y_train_up)

# Predicted labels for the held-out test features.
y_pred = gb_cllf.predict(X_test)

# Per-class metrics followed by the confusion matrix.
gb_report = classification_report(y_test, y_pred)
gb_confusion = confusion_matrix(y_test, y_pred)

print("Classification Report - Gradient Boosting")
print(gb_report)

print("Confusion Matrix - Gradient Boosting")
print(gb_confusion)


The results of Gradient Boosting seem to be similar to the Logistic Regression and RandomizedSearchCV models. The precision, recall, and F1-score for the positive class (target = 1) are low, indicating that the model is not performing well at detecting positive cases. However, the accuracy is high, which means that the model is correctly predicting most of the negative cases (target = 0).

from sklearn.utils import resample

In [None]:
#decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score

# Entropy-based tree, limited in depth and leaf size to reduce overfitting.
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 2000, max_depth=6,min_samples_split=70,min_samples_leaf=30)
classifier.fit(X_train_up, y_train_up)

# Predict the target variable for the test data
y_pred = classifier.predict(X_test)

# Evaluate the model using classification report and confusion matrix
print("Classification Report - Decision tree")
print(classification_report(y_test, y_pred))

# Accuracy of the decision tree (this line used to be labelled "RF Score",
# although no random forest is involved).
print('Decision Tree Score: ', metrics.accuracy_score(y_test, y_pred))

print("Confusion Matrix - Decision tree")
print(confusion_matrix(y_test, y_pred))

# View accuracy score
accuracy_score(y_test, y_pred)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Instantiate a decision tree classifier
dtc = DecisionTreeClassifier(random_state=123)

# Instantiate a bagging classifier
bagging = BaggingClassifier(dtc, n_estimators=100, random_state=123)

# Train on the balanced training set used by every other model in this
# notebook, whose columns match the test features exactly.
bagging.fit(X_train_up, y_train_up)

# Predict the test set labels
y_pred = bagging.predict(X_test)

# Test-set accuracy, computed from the predictions above so that the
# 100-tree ensemble does not have to predict a second time.
test_score = float(np.mean(y_pred == y_test))

# Print the test set accuracy
print("Bagging Test Set Score:", test_score)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Instantiate a decision tree classifier
# NOTE(review): the tree has no depth limit; a base learner that fits the
# training data perfectly presumably leaves AdaBoost nothing to boost --
# consider a shallow tree (e.g. max_depth=1) and confirm.
dtc = DecisionTreeClassifier(random_state=123)

# Instantiate an AdaBoost classifier
adaboost = AdaBoostClassifier(dtc, n_estimators=100, random_state=123)

# Train on the balanced training set used by every other model in this
# notebook, whose columns match the test features exactly.
adaboost.fit(X_train_up, y_train_up)

# Predict the test set labels
y_pred = adaboost.predict(X_test)

# Test-set accuracy, computed from the predictions above so that the
# ensemble does not have to predict a second time.
test_score = float(np.mean(y_pred == y_test))

# Print the test set accuracy
print("Boosting Test Set Score:", test_score)


In [None]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
)

# Compare the individual models and a hard-voting ensemble.
#
# All models are trained on the balanced, standardized training data and
# scored on the original hold-out set. The previous version re-split the
# resampled frame and then trained on the whole frame, so every "test" row
# had also been seen during training and the scores were inflated.
X_test_scaled = mms.transform(X_test)

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train_up_scaled, y_train_up)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_up_scaled, y_train_up)

# Gradient Boosting (seeded so that the reported score is reproducible)
gb_model = GradientBoostingClassifier(random_state=123)
gb_model.fit(X_train_up_scaled, y_train_up)

# Bagging (default base estimator)
bagging = BaggingClassifier(random_state=123)
bagging.fit(X_train_up_scaled, y_train_up)

# AdaBoost (default base estimator)
adaboost = AdaBoostClassifier(random_state=123)
adaboost.fit(X_train_up_scaled, y_train_up)

# Hard-voting ensemble of Logistic Regression, K-Nearest Neighbors and
# Gradient Boosting. VotingClassifier fits its own clones of the estimators.
voting_model = VotingClassifier(estimators=[('lr', lr_model), ('knn', knn_model), ('gb', gb_model)], voting='hard')
voting_model.fit(X_train_up_scaled, y_train_up)

# Accuracy of each model and of the ensemble on the hold-out set.
print('Logistic Regression Test Set Score:', lr_model.score(X_test_scaled, y_test))
print('K-Nearest Neighbors Test Set Score:', knn_model.score(X_test_scaled, y_test))
print('Gradient Boosting Test Set Score:', gb_model.score(X_test_scaled, y_test))
print('Voting Classifier Test Set Score:', voting_model.score(X_test_scaled, y_test))
print("Bagging Test Set Score:", bagging.score(X_test_scaled, y_test))
print("Boosting Test Set Score:",adaboost.score(X_test_scaled, y_test))


### Task 1:-Create a predictive model which will help the insurance marketing team to know which customer will buy the product.

In conclusion, the developed predictive model, particularly the Randomized Logistic Regression and the ensemble Voting Classifier, can be valuable tools for the insurance marketing team. These models offer insights into potential customers likely to purchase the insurance product, aiding the team in targeted marketing strategies and resource allocation for more effective and efficient campaigns.

 ## Task2 : Suggestions to the Insurance market team to make  customers  buy the product.


Clear Communication: Clearly explain insurance product details.

Educational Content: Provide simple, informative content.

Targeted Marketing: Personalize campaigns for specific customer groups.

Discounts and Promotions: Offer incentives and limited-time offers.

Online Presence: Ensure a user-friendly website and app.

Customer Testimonials: Share positive customer experiences.

## Report on Challenges faced

Report on Data Analysis Challenges and Techniques Used

1. Data Exploration and Preprocessing:

Challenge: Missing Values represented by -1.
Technique Used: Replaced missing values with NaN.
Reason: NaN values are easier to handle in subsequent analyses, imputations, and visualizations.
2. Metadata Creation:

Challenge: Managing the metadata information for each variable.
Technique Used: Created a metadata DataFrame to categorize variables based on role, level, and data type.
Reason: Simplifies data exploration, feature engineering, and model building by providing organized metadata.
3. Feature Selection:

Challenge: Dealing with numerous features and identifying irrelevant ones.
Technique Used: Removed low-variance features using VarianceThreshold.
Reason: Reduces dimensionality by eliminating features with low variance, improving computational efficiency.
4. Handling Missing Values:

Challenge: Features with high missing values, e.g., ps_car_03_cat, ps_reg_03, etc.
Technique Used: Dropped features with the highest missing values and imputed the rest using mean or mode.
Reason: Focus on relevant features and prevent loss of valuable information.
5. Categorical Data Encoding:

Challenge: Converting categorical variables into a numerical format.
Technique Used: Applied label encoding (LabelEncoder) for categorical variables.
Reason: Enables machine learning models to process categorical data effectively.
6. Model Training and Class Imbalance:

Challenge: Class imbalance after upsampling.
Technique Used: Utilized ensemble models (Voting Classifier) and evaluated their performance.
Reason: Combining multiple models can address the impact of class imbalance and enhance overall prediction accuracy.
7. Hyperparameter Tuning:

Challenge: Optimizing hyperparameters for Logistic Regression using RandomizedSearchCV.
Technique Used: Implemented RandomizedSearchCV for hyperparameter tuning.
Reason: Efficiently explores hyperparameter space, improving model performance.
8. Model Evaluation and Interpretation:

Challenge: Assessing model performance comprehensively.
Technique Used: Utilized classification metrics (Precision, Recall, F1-score) and confusion matrices.
Reason: Provides insights into model strengths, weaknesses, and trade-offs between false positives and false negatives.
9. Machine Learning Models:

Challenges: Poor performance, especially in predicting the positive class.
Techniques Used: Explored Logistic Regression, K-Nearest Neighbors, Gradient Boosting, SVM, Decision Tree, Bagging, and AdaBoost.
Reason: Comparative analysis to identify the most suitable model based on the dataset characteristics.
10. Ensemble Learning:

Challenge: Improving model robustness.
Technique Used: Employed Voting Classifier with Logistic Regression, K-Nearest Neighbors, and Gradient Boosting.
Reason: Combining diverse models can lead to better overall performance by leveraging their individual strengths.
11. Feature Scaling:

Challenge: Ensuring consistency in feature scales.
Technique Used: Applied StandardScaler for feature scaling.
Reason: Standardized features to a common scale, preventing certain features from dominating the modeling process.
12. Report and Conclusion:

Challenge: Summarizing findings and recommendations.
Technique Used: Created a comprehensive report summarizing challenges, techniques, and model evaluations.
Reason: Provides a clear overview for stakeholders, facilitating decision-making and future improvements.


## Create a report stating the performance of multiple models on this data and suggest the best model for production.

Logistic Regression:

Precision (Class 1): 0.05
Recall (Class 1): 0.55
F1-score (Class 1): 0.09
Accuracy: 0.62
K-Nearest Neighbors:

Precision (Class 1): 0.04
Recall (Class 1): 0.25
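F1-score (Class 1): ≈0.07 (harmonic mean of the precision and recall above; the confusion matrix reports 1,632 true positives)
Accuracy: ≈0.75 (from the confusion matrix: (132149 + 1632) / 178564; 0.77 is the recall of class 0)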
Randomized Logistic Regression:

Precision (Class 1): 0.05
Recall (Class 1): 0.55
F1-score (Class 1): 0.09
Accuracy: ≈0.61 (from the confusion matrix: (106124 + 3602) / 178564)
Support Vector Machine (SVM):

Precision (Class 1): 0.05
Recall (Class 1): 0.49
F1-score (Class 1): 0.09
Accuracy: 0.66
Gradient Boosting:

Precision (Class 1): 0.05
Recall (Class 1): 0.49
F1-score (Class 1): 0.09
Accuracy: 0.92
Decision Tree:

Precision (Class 1): 0.05
Recall (Class 1): 0.47
F1-score (Class 1): 0.09
Accuracy: 0.91
Bagging (Ensemble of Decision Trees):

Test Set Score: Varies based on the run
AdaBoost (Ensemble of Decision Trees):

Test Set Score: Varies based on the run
Voting Classifier (Ensemble of Logistic Regression, K-Nearest Neighbors, Gradient Boosting):

Test Set Score: Varies based on the run

## Suggested Model for Production:
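Gradient Boosting: Among the models compared, this model reports the highest accuracy on the test set (0.92), with a positive-class recall (0.49) comparable to the other models. Note, however, that the positive-class precision (0.05) and F1-score (0.09) are low for every model above, including this one, so the positive-class predictions should be validated (for example with ROC AUC or a tuned decision threshold) before production use.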