## Marriage Trends in India

In [None]:
import pandas as pd
import numpy as np
from numpy.ma.core import arange
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the raw marriage dataset from the working directory and preview it.
csv_path = 'marriage_data_india.csv'
data = pd.read_csv(csv_path)
display(data.head())

In [None]:
# Number of missing entries in each column of the raw frame.
display(data.isnull().sum())

In [None]:
# Names of the object-dtype columns (treated as categorical).
cat_cols = list(data.select_dtypes(include=['object']).columns)
display("categorical_cols: \n", cat_cols)

# Names of the numeric-dtype columns.
num_cols = list(data.select_dtypes(include=[np.number]).columns)
display("numerical_cols: \n", num_cols)

In [None]:
# Target label and feature frame (row identifier and label removed).
y = data['Divorce_Status']
X = data.drop(columns=['ID', 'Divorce_Status'])

# Give the hyphenated columns underscore names. The copies are appended at the
# end of the frame and the originals dropped, so column order is unchanged
# relative to doing the two assignments by hand.
hyphenated_to_clean = {
    'Inter-Caste': 'Inter_Caste',
    'Inter-Religion': 'Inter_Religion',
}
for old_name, new_name in hyphenated_to_clean.items():
    X[new_name] = X[old_name]
X = X.drop(columns=list(hyphenated_to_clean))

In [None]:
# Columns that get expanded into indicator (dummy) columns.
categorical_cols = [
    'Marriage_Type', 'Gender', 'Education_Level', 'Caste_Match',
    'Religion', 'Parental_Approval', 'Urban_Rural', 'Dowry_Exchanged',
    'Marital_Satisfaction', 'Income_Level', 'Spouse_Working',
    'Inter_Caste', 'Inter_Religion',
]

# drop_first=True omits one indicator per encoded column.
X = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)
display(X.head())

In [None]:
# Learn the label -> integer mapping, then replace y with its integer codes.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class. The seed makes the synthetic rows, and every
# metric computed from them further down, repeatable across kernel restarts.
# NOTE(review): resampling happens before train_test_split, so synthetic rows
# interpolated from samples that later land in the test split can end up in
# the training split (and vice versa). Consider resampling the training part
# only.
smote_obj = SMOTE(random_state=42)
X_new, y_new = smote_obj.fit_resample(X, y)

In [None]:
import statsmodels.api as sm

def backwardElimination(x, y, sl):
    """Drop OLS regressors one at a time until every p-value is <= ``sl``.

    On each pass an ordinary-least-squares model of ``y`` on ``x`` is fitted
    and the column with the largest p-value is removed if that p-value
    exceeds ``sl``. Elimination stops as soon as no p-value exceeds ``sl``
    (or no columns are left), so the model is not refitted once the result
    can no longer change.

    Parameters
    ----------
    x : 2-D numpy array
        Design matrix, one column per regressor.
    y : array-like
        Response values, one per row of ``x``.
    sl : float
        Significance level; a column is removed while its p-value is the
        largest one and is greater than ``sl``.

    Returns
    -------
    (x, indices)
        ``x`` with the eliminated columns removed, and the list of original
        column positions that survived, in the same order as the columns of
        the returned ``x``.

    The summary of the final fitted model is displayed when at least one
    column survives; nothing is displayed if every column was eliminated.
    """
    indices = list(range(x.shape[1]))
    obj_OLS = None
    while indices:
        obj_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(obj_OLS.pvalues, dtype=float)
        # argmax returns the first position of the maximum, i.e. the same
        # column an exact-equality scan from the left would find.
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            break
        x = np.delete(x, worst, 1)
        indices.pop(worst)
        # The fit above no longer describes the reduced matrix.
        obj_OLS = None
    if obj_OLS is not None:
        display(obj_OLS.summary())
    return x, indices

# Significance threshold handed to the elimination routine.
SL = 0.05

# Put a column of ones in front of the resampled features and force the
# whole matrix to float64.
ones_column = np.ones((len(X_new), 1))
X_backe = np.concatenate((ones_column, X_new), axis=1).astype('float64')
X_sig = X_backe

X_Modeled, indices = backwardElimination(X_sig, y_new, SL)
display(X_Modeled)
display(indices)

In [None]:
# 40% train / 60% test split of the resampled data, stratified on the label.
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
    X_new,
    y_new,
    test_size=0.6,
    random_state=42,
    stratify=y_new,
)

In [None]:
# Numeric columns that are standardised; the remaining columns are left as is.
scaler_cols = ['Age_at_Marriage', 'Children_Count', 'Years_Since_Marriage']
scaler = StandardScaler()

# Work on explicit copies so the column assignments below do not write into
# views of the resampled frame.
X_train_smote = X_train_smote.copy()
X_test_smote = X_test_smote.copy()

# Fit the scaler on the training split only, then apply the same statistics
# to the test split. Previously the training split was transformed twice and
# the test split was never scaled.
# NOTE: this cell rescales in place; re-run the split cell before re-running it.
X_train_smote[scaler_cols] = scaler.fit_transform(X_train_smote[scaler_cols])
X_test_smote[scaler_cols] = scaler.transform(X_test_smote[scaler_cols])

display(X_train_smote.head())
display(X_test_smote.head())

In [None]:
# Render the training feature frame produced by the split and scaling cells.
display(X_train_smote)

In [None]:
#Fitting Logistic Regression to Training Set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Default logistic-regression model fitted on the training split.
classifierObj = LogisticRegression()
classifierObj.fit(X_train_smote, y_train_smote)

# Class predictions for the test split.
y_pred = classifierObj.predict(X_test_smote)
print(y_pred)

# Class-membership probabilities for the same rows.
y_pred_prob = classifierObj.predict_proba(X_test_smote)

# Mean accuracy on the test split.
test_accuracy = classifierObj.score(X_test_smote, y_test_smote)
print(test_accuracy)

In [None]:
#Testing different test sizes vs. accuracy for Logistic Regression

# Test-set fractions 0.1 ... 0.9. linspace fixes the number of points, which a
# float-stepped arange does not guarantee, and the same array is reused for
# the x-axis of the plot below.
test_sizes = np.linspace(0.1, 0.9, 9)
scaler_cols = ['Age_at_Marriage', 'Children_Count', 'Years_Since_Marriage']

accuracy_scores = []
for test_size in test_sizes:
    # Loop-local names: the notebook-level X_train_smote / X_test_smote /
    # y_train_smote / y_test_smote used by later cells are deliberately left
    # untouched, instead of being replaced by the last (90% test) split.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_new, y_new, test_size=test_size, random_state=42, stratify=y_new)
    X_tr = X_tr.copy()
    X_te = X_te.copy()

    # Fit the scaler on the training part, apply it to the test part.
    # Previously the training part was transformed twice and the test part
    # was left unscaled.
    split_scaler = StandardScaler()
    X_tr[scaler_cols] = split_scaler.fit_transform(X_tr[scaler_cols])
    X_te[scaler_cols] = split_scaler.transform(X_te[scaler_cols])

    split_model = LogisticRegression()
    split_model.fit(X_tr, y_tr)

    # Mean accuracy on this split's test part.
    accuracy_scores.append(split_model.score(X_te, y_te))

#graph test size vs accuracy
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(test_sizes, accuracy_scores, marker='o')
plt.title('Test Size vs Logistic Accuracy')
plt.xlabel('Test Size')
plt.ylabel('Logistic Accuracy')
plt.xticks(np.arange(0, 1, .1))
plt.yticks(np.arange(0, 1, .1))
plt.grid()
plt.show()

In [None]:
# #Applying PCA
# from sklearn.decomposition import PCA
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix

# pcaObj = PCA(n_components=2)
# X_train_pca = pcaObj.fit_transform(X_new)
# X_test_pca = pcaObj.transform(X_test_smote)
# components_variance = pcaObj.explained_variance_ratio_
# print(components_variance)
# #Fitting Logistic Regression to Training Set
# classifierObj = LogisticRegression(random_state=0)
# classifierObj.fit(X_train_pca, y_train_smote)
# #Making predictions on the Test Set
# y_pred = classifierObj.predict(X_test_pca)
# #Evaluating the predictions using a Confusion Matrix
# cm = confusion_matrix(y_test_smote, y_pred)
# print(cm)


# # Visualizing the Training set results
# from matplotlib.colors import ListedColormap
# import matplotlib.pyplot as plt
#
# X_set, y_set = X_train_smote, y_train_smote
# X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
#                      np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# cmap = ListedColormap(['red', 'blue', 'brown'])
# plt.contourf(X1, X2, classifierObj.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#              alpha = 0.75, cmap = cmap)
# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# colors = ['red', 'blue', 'brown']
# for i, j in enumerate(np.unique(y_set)):
#     plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = colors[i],
#                 label = j, cmap = cmap)
# plt.title('Logistic Regression (Training set)')
# plt.xlabel('PC1')
# plt.ylabel('PC2')
# plt.legend()
# plt.show()


In [None]:
#Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Number of LDA components: the smaller of the feature count and
# (number of distinct training labels - 1).
n_features = X_train_smote.shape[1]
n_labels = len(np.unique(y_train_smote))
max_components = min(n_features, n_labels - 1)

ldaObj = LDA(n_components=max_components)
X_train_lda = ldaObj.fit_transform(X_train_smote, y_train_smote)
X_test = ldaObj.transform(X_test_smote)

import matplotlib.pyplot as plt

# Plot the projected training rows along a flat zero line, coloured by label.
plt.figure(figsize=(10, 6))
zero_line = np.zeros_like(X_train_lda)
plt.scatter(
    X_train_lda,
    zero_line,
    c=y_train_smote,
    cmap='viridis',
    edgecolor='k',
)
plt.title("LDA Transformed Training Data")
plt.xlabel("LDA Component")
plt.ylabel("Zero Line")
plt.colorbar(label='Class Label')
plt.show()


In [None]:
from sklearn.decomposition import KernelPCA

# Performing Kernel PCA (RBF kernel, 25 components)
kernel_pca = KernelPCA(n_components=25, kernel='rbf')
X_train_kernel_pca = kernel_pca.fit_transform(X_train_smote)
X_test_kernel_pca = kernel_pca.transform(X_test_smote)

# Zero-based positions of the two components shown in the scatter plot.
# The axis labels are derived from these so they always name the components
# that are actually plotted (previously indices 3 and 4 were plotted under
# the labels "Component 1" / "Component 2").
x_component, y_component = 3, 4

# Scatter plot of Kernel PCA transformed data
plt.figure(figsize=(10, 6))
plt.scatter(
    X_train_kernel_pca[:, x_component],
    X_train_kernel_pca[:, y_component],
    c=y_train_smote,
    cmap='viridis',
    edgecolor='k',
)
plt.title("Kernel PCA Transformed Training Data")
plt.xlabel(f"Kernel PCA Component {x_component + 1}")
plt.ylabel(f"Kernel PCA Component {y_component + 1}")
plt.colorbar(label='Class Label')
plt.show()


In [None]:
from sklearn.metrics import classification_report

def modelclassificationreport(classifier, classifierName):
    """Fit ``classifier`` and print its classification report.

    The estimator is fitted on the notebook-level ``X_train_smote`` /
    ``y_train_smote`` and evaluated on ``X_test_smote`` / ``y_test_smote``.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``
        Fitted in place by this call.
    classifierName : str
        Label printed in the header line.

    Returns
    -------
    None
        The header and the report are printed, not returned.
    """
    classifier.fit(X_train_smote, y_train_smote)
    predictions = classifier.predict(X_test_smote)

    header_template = "Classification Report for Classifier: {}, className: {}"
    print(header_template.format(classifier, classifierName))
    print(classification_report(y_test_smote, predictions))

In [None]:
#logistic regression
from sklearn.linear_model import LogisticRegression

# Seeded logistic-regression model, evaluated through the report helper.
logistic_classifier = LogisticRegression(random_state=0)
modelclassificationreport(
    classifier=logistic_classifier,
    classifierName="Logistic Regression",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the printed report is the same on every run.
modelclassificationreport(RandomForestClassifier(random_state=42), "RandomForestClassifier")

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel.
modelclassificationreport(
    classifier=SVC(kernel='rbf'),
    classifierName="RBF SVM",
)

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with a polynomial kernel.
modelclassificationreport(
    classifier=SVC(kernel='poly'),
    classifierName="SVM with poly",
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion tree; seeded so the printed report is repeatable.
modelclassificationreport(
    DecisionTreeClassifier(criterion="entropy", random_state=42),
    "DecisionTreeClassifier",
)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
modelclassificationreport(
    classifier=GaussianNB(),
    classifierName="GaussianNB",
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics  import f1_score

# Candidate neighbour counts, shared by the sweep and the plot.
k_values = range(1, 40)

# Macro-averaged F1 on the test split for each candidate k.
# NOTE(review): k is chosen on the same split that is used for the final
# score below, so that score is optimistic; a separate validation split or
# cross-validation would avoid this.
macro_f1_scores = []
for k in k_values:
    classifierObj = KNeighborsClassifier(n_neighbors=k)
    classifierObj.fit(X_train_smote, y_train_smote)
    y_pred = classifierObj.predict(X_test_smote)
    f1 = f1_score(y_test_smote, y_pred, average='macro')
    macro_f1_scores.append(f1)

plt.plot(list(k_values), macro_f1_scores, marker='o')
plt.xlabel('k')
plt.ylabel('F1 Score')
plt.title('F1 Score vs k')
plt.xticks(range(1, 40, 2))
plt.grid(True)
plt.show()

# Despite its name this holds the best k itself (list position + 1, because
# the sweep starts at k=1); the name is kept for any later use.
best_k_index = int(np.argmax(macro_f1_scores)) + 1
print(f"Best k: {best_k_index} at {macro_f1_scores[best_k_index-1]}")

# Refit with the k found above rather than a hard-coded value, so the final
# model always matches the sweep that was just run.
classifierObj = KNeighborsClassifier(n_neighbors=best_k_index)
classifierObj.fit(X_train_smote, y_train_smote)
y_pred = classifierObj.predict(X_test_smote)
f1 = f1_score(y_test_smote, y_pred, average='macro')
print(f"F1 Score: {f1}")


In [None]:
from sklearn.ensemble import VotingClassifier

# Members of the hard-voting ensemble. The two tree-based members are seeded
# so the ensemble's report is repeatable from run to run.
classifiers = [
    ('RandomForestClassifier', RandomForestClassifier(random_state=42)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=10)),
]

# voting='hard': each member casts one vote per row and the majority wins.
votingClassifier = VotingClassifier(estimators=classifiers, voting='hard')
modelclassificationreport(votingClassifier, "VotingClassifier")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Seeded forest so accuracy, confusion matrix and importances are repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)
y_pred = rf.predict(X_test_smote)
print("Random Forest Classifier Accuracy:", rf.score(X_test_smote, y_test_smote))

cm = confusion_matrix(y_test_smote, y_pred)
print("Confusion Matrix:")
print(cm)

print("Classification Report:")
print(classification_report(y_test_smote, y_pred))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Feature Importance, most important first. Each line shows the rank, the
# column position and the column name, so the ranking can be read without
# looking positions up in the frame.
feature_importances = rf.feature_importances_
indices = np.argsort(feature_importances)[::-1]
feature_names = list(X_train_smote.columns)
print("Feature ranking:")
for rank, position in enumerate(indices, start=1):
    print("%d. feature %d - %s (%f)" % (
        rank, position, feature_names[position], feature_importances[position]))


In [None]:
# Object-dtype columns of the raw frame; one count plot is drawn per column.
category_col = data.select_dtypes(include='object').columns

for col in category_col:
    # Create the figure inside the loop: plt.show() ends the current figure,
    # so a single figure created before the loop would size only the first
    # plot.
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, data=data, palette='magma')
    plt.ylabel('Frequency')
    plt.title(f'The distribution of {col}')
    # No legend call: the bars carry no labels, so there is nothing to list.
    plt.show()