# Hair Health Prediction

### Importing Libraries

In [27]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn import preprocessing  
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# for accuracy check
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import joblib

### Getting our Data

In [5]:
df = pd.read_csv(r'Predict Hair Fall.csv')
df

Unnamed: 0,Id,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,133992,Yes,No,No Data,No Data,Magnesium deficiency,Moderate,19,Yes,Yes,No,No,0
1,148393,No,No,Eczema,Antibiotics,Magnesium deficiency,High,43,Yes,Yes,No,No,0
2,155074,No,No,Dermatosis,Antifungal Cream,Protein deficiency,Moderate,26,Yes,Yes,No,Yes,0
3,118261,Yes,Yes,Ringworm,Antibiotics,Biotin Deficiency,Moderate,46,Yes,Yes,No,No,0
4,111915,No,No,Psoriasis,Accutane,Iron deficiency,Moderate,30,No,Yes,Yes,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,184367,Yes,No,Seborrheic Dermatitis,Rogaine,Vitamin A Deficiency,Low,33,Yes,Yes,Yes,Yes,1
995,164777,Yes,Yes,No Data,Accutane,Protein deficiency,Low,47,No,No,No,Yes,0
996,143273,No,Yes,Androgenetic Alopecia,Antidepressants,Protein deficiency,Moderate,20,Yes,No,Yes,Yes,1
997,169123,No,Yes,Dermatitis,Immunomodulators,Biotin Deficiency,Moderate,32,Yes,Yes,Yes,Yes,1


### Data Preprocessing

In [6]:
# to see number of missing elements, use df.isnull().sum()
df.isnull().any() 

Id                           False
Genetics                     False
Hormonal Changes             False
Medical Conditions           False
Medications & Treatments     False
Nutritional Deficiencies     False
Stress                       False
Age                          False
Poor Hair Care Habits        False
Environmental Factors        False
Smoking                      False
Weight Loss                  False
Hair Loss                    False
dtype: bool

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Id                         999 non-null    int64 
 1   Genetics                   999 non-null    object
 2   Hormonal Changes           999 non-null    object
 3   Medical Conditions         999 non-null    object
 4   Medications & Treatments   999 non-null    object
 5   Nutritional Deficiencies   999 non-null    object
 6   Stress                     999 non-null    object
 7   Age                        999 non-null    int64 
 8   Poor Hair Care Habits      999 non-null    object
 9   Environmental Factors      999 non-null    object
 10  Smoking                    999 non-null    object
 11  Weight Loss                999 non-null    object
 12  Hair Loss                  999 non-null    int64 
dtypes: int64(3), object(10)
memory usage: 101.6+ KB


In [8]:
df.columns

Index(['Id', 'Genetics', 'Hormonal Changes', 'Medical Conditions',
       'Medications & Treatments', 'Nutritional Deficiencies ', 'Stress',
       'Age', 'Poor Hair Care Habits ', 'Environmental Factors', 'Smoking',
       'Weight Loss ', 'Hair Loss'],
      dtype='object')

In [9]:
obj_columns = ['Genetics', 'Hormonal Changes', 'Medical Conditions', 'Medications & Treatments',
               'Nutritional Deficiencies ', 'Stress', 'Poor Hair Care Habits ', 'Environmental Factors',
               'Smoking', 'Weight Loss ']

for i in obj_columns:
    label_encoder = preprocessing.LabelEncoder()
    df[i] = label_encoder.fit_transform(df[i])  

df

Unnamed: 0,Id,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,133992,1,0,5,8,2,2,19,1,1,0,0,0
1,148393,0,0,4,1,2,0,43,1,1,0,0,0
2,155074,0,0,3,3,5,2,26,1,1,0,1,0
3,118261,1,1,7,1,0,2,46,1,1,0,0,0
4,111915,0,0,6,0,1,2,30,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,184367,1,0,9,9,7,1,33,1,1,1,1,1
995,164777,1,1,5,0,5,1,47,0,0,0,1,0
996,143273,0,1,1,2,5,2,20,1,0,1,1,1
997,169123,0,1,2,7,0,2,32,1,1,1,1,1


In [10]:
# dropping unnecessary column
df.drop(['Id'], axis=1, inplace=True)
df

Unnamed: 0,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,1,0,5,8,2,2,19,1,1,0,0,0
1,0,0,4,1,2,0,43,1,1,0,0,0
2,0,0,3,3,5,2,26,1,1,0,1,0
3,1,1,7,1,0,2,46,1,1,0,0,0
4,0,0,6,0,1,2,30,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
994,1,0,9,9,7,1,33,1,1,1,1,1
995,1,1,5,0,5,1,47,0,0,0,1,0
996,0,1,1,2,5,2,20,1,0,1,1,1
997,0,1,2,7,0,2,32,1,1,1,1,1


In [11]:
df.columns

Index(['Genetics', 'Hormonal Changes', 'Medical Conditions',
       'Medications & Treatments', 'Nutritional Deficiencies ', 'Stress',
       'Age', 'Poor Hair Care Habits ', 'Environmental Factors', 'Smoking',
       'Weight Loss ', 'Hair Loss'],
      dtype='object')

In [12]:
# checking for variance inflation factor
variables = df[['Genetics', 'Hormonal Changes', 'Medical Conditions',
       'Medications & Treatments', 'Nutritional Deficiencies ', 'Stress',
       'Age', 'Poor Hair Care Habits ', 'Environmental Factors', 'Smoking',
       'Weight Loss ']]

vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
vif['Features'] = variables.columns

In [13]:
vif

Unnamed: 0,VIF,Features
0,2.001465,Genetics
1,1.955988,Hormonal Changes
2,3.102401,Medical Conditions
3,2.983372,Medications & Treatments
4,3.449854,Nutritional Deficiencies
5,2.495439,Stress
6,8.064002,Age
7,1.927843,Poor Hair Care Habits
8,1.9946,Environmental Factors
9,2.044023,Smoking


In [14]:
# vif is <10 so we can consider all the attributes

### Splitting Data for Training and Testing

In [15]:
data = df.values
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0, shuffle=True)  # splitting in the ratio 70:30

### Decision Tree Model

In [16]:
clf = DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, y_train)

### Evaluating Model Performance

In [17]:
y_train_pred = clf.predict(X_train)
print('Training Accuracy:', accuracy_score(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print('Testing Accuracy:', accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.6280400572246065
Testing Accuracy: 0.5266666666666666


### Cross Validation

In [48]:
df = pd.read_csv(r'Predict Hair Fall.csv')
df

Unnamed: 0,Id,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,133992,Yes,No,No Data,No Data,Magnesium deficiency,Moderate,19,Yes,Yes,No,No,0
1,148393,No,No,Eczema,Antibiotics,Magnesium deficiency,High,43,Yes,Yes,No,No,0
2,155074,No,No,Dermatosis,Antifungal Cream,Protein deficiency,Moderate,26,Yes,Yes,No,Yes,0
3,118261,Yes,Yes,Ringworm,Antibiotics,Biotin Deficiency,Moderate,46,Yes,Yes,No,No,0
4,111915,No,No,Psoriasis,Accutane,Iron deficiency,Moderate,30,No,Yes,Yes,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,184367,Yes,No,Seborrheic Dermatitis,Rogaine,Vitamin A Deficiency,Low,33,Yes,Yes,Yes,Yes,1
995,164777,Yes,Yes,No Data,Accutane,Protein deficiency,Low,47,No,No,No,Yes,0
996,143273,No,Yes,Androgenetic Alopecia,Antidepressants,Protein deficiency,Moderate,20,Yes,No,Yes,Yes,1
997,169123,No,Yes,Dermatitis,Immunomodulators,Biotin Deficiency,Moderate,32,Yes,Yes,Yes,Yes,1


In [49]:
df.drop(['Id'], axis=1, inplace=True)
# df = pd.get_dummies(df)

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

print(df[categorical_columns])

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([df, one_hot_df], axis=1)

# Drop the original categorical columns
df = df_encoded.drop(categorical_columns, axis=1)

df

    Genetics Hormonal Changes     Medical Conditions  \
0        Yes               No                No Data   
1         No               No                 Eczema   
2         No               No             Dermatosis   
3        Yes              Yes               Ringworm   
4         No               No              Psoriasis   
..       ...              ...                    ...   
994      Yes               No  Seborrheic Dermatitis   
995      Yes              Yes                No Data   
996       No              Yes  Androgenetic Alopecia   
997       No              Yes             Dermatitis   
998      Yes              Yes              Psoriasis   

      Medications & Treatments Nutritional Deficiencies     Stress  \
0                      No Data      Magnesium deficiency  Moderate   
1                  Antibiotics      Magnesium deficiency      High   
2             Antifungal Cream        Protein deficiency  Moderate   
3                  Antibiotics        Biotin De

Unnamed: 0,Age,Hair Loss,Genetics_No,Genetics_Yes,Hormonal Changes_No,Hormonal Changes_Yes,Medical Conditions_Alopecia Areata,Medical Conditions_Androgenetic Alopecia,Medical Conditions_Dermatitis,Medical Conditions_Dermatosis,...,Stress_Low,Stress_Moderate,Poor Hair Care Habits _No,Poor Hair Care Habits _Yes,Environmental Factors_No,Environmental Factors_Yes,Smoking_No,Smoking_Yes,Weight Loss _No,Weight Loss _Yes
0,19,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,43,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,26,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
3,46,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,30,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,33,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
995,47,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
996,20,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
997,32,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [33]:
# data = df.values
# X, y = data[:,:-1], data[:,-1]
X = df.drop(columns=['Hair Loss'])  # Drop the target column
y = df['Hair Loss']  # Select only the target column

X = X.values
y = y.values

In [34]:
# we'll use K-Fold cross validation technique for selecting training & testing sets randomly for achieving the maximum accuracy
kf = KFold(shuffle=True, random_state=100, n_splits=30)

In [82]:
scores = []
best_accuracy = 0.0
best_model = None
    
# splitting for training & testing randomly
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index] 

    clf = RandomForestClassifier(max_depth=50)
    clf = clf.fit(X_train, y_train)   
    y_test_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_test_pred)
    scores.append(score)

    if score > best_accuracy:
        best_accuracy = score
        best_model = clf
        joblib.dump(best_model, 'hair_health_prediction_model.pkl')
    
print(scores)
print('Max Accuracy Score:', round(max(scores), 4))

[0.5, 0.38235294117647056, 0.5, 0.5294117647058824, 0.3235294117647059, 0.47058823529411764, 0.35294117647058826, 0.4411764705882353, 0.7352941176470589, 0.5757575757575758, 0.42424242424242425, 0.45454545454545453, 0.5151515151515151, 0.5151515151515151, 0.45454545454545453, 0.6363636363636364, 0.48484848484848486, 0.42424242424242425, 0.48484848484848486, 0.45454545454545453, 0.48484848484848486, 0.42424242424242425, 0.48484848484848486, 0.5151515151515151, 0.48484848484848486, 0.36363636363636365, 0.45454545454545453, 0.3939393939393939, 0.6363636363636364, 0.6666666666666666]
Max Accuracy Score: 0.7353


### Making Predictions

In [109]:
model = joblib.load('hair_health_prediction_model.pkl')

In [110]:
categorical_columns = [
    'Genetics', 'Hormonal Changes', 'Medical Conditions',
       'Medications & Treatments', 'Nutritional Deficiencies ', 'Stress', 'Poor Hair Care Habits ', 'Environmental Factors', 'Smoking',
       'Weight Loss '
]

# Collect user input for each categorical column
data = {'Genetics': ['Yes'],
 'Hormonal Changes': ['No'],
 'Medical Conditions': ['No Data'],
 'Medications & Treatments': ['No Data'],
 'Nutritional Deficiencies ': ['Iron deficiency'],
 'Stress': ['Low'],
 'Poor Hair Care Habits ': ['No'],
 'Environmental Factors': ['No'],
 'Smoking': ['Yes'],
 'Weight Loss ': ['No']}

# Create DataFrame from user input
new_data = pd.DataFrame(data)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.transform(new_data)

# Create DataFrame from one-hot encoded data
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(new_data.columns))

print("One-hot encoded DataFrame:")
len(one_hot_encoded[0])

One-hot encoded DataFrame:


48

In [111]:
one_hot_encoded_df

Unnamed: 0,Genetics_No,Genetics_Yes,Hormonal Changes_No,Hormonal Changes_Yes,Medical Conditions_Alopecia Areata,Medical Conditions_Androgenetic Alopecia,Medical Conditions_Dermatitis,Medical Conditions_Dermatosis,Medical Conditions_Eczema,Medical Conditions_No Data,...,Stress_Low,Stress_Moderate,Poor Hair Care Habits _No,Poor Hair Care Habits _Yes,Environmental Factors_No,Environmental Factors_Yes,Smoking_No,Smoking_Yes,Weight Loss _No,Weight Loss _Yes
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [112]:
data

{'Genetics': ['Yes'],
 'Hormonal Changes': ['No'],
 'Medical Conditions': ['No Data'],
 'Medications & Treatments': ['No Data'],
 'Nutritional Deficiencies ': ['Iron deficiency'],
 'Stress': ['Low'],
 'Poor Hair Care Habits ': ['No'],
 'Environmental Factors': ['No'],
 'Smoking': ['Yes'],
 'Weight Loss ': ['No']}

In [113]:
# Method 2: Using insert method
one_hot_encoded_df.insert(0, 'Age', 22)
one_hot_encoded_df

Unnamed: 0,Age,Genetics_No,Genetics_Yes,Hormonal Changes_No,Hormonal Changes_Yes,Medical Conditions_Alopecia Areata,Medical Conditions_Androgenetic Alopecia,Medical Conditions_Dermatitis,Medical Conditions_Dermatosis,Medical Conditions_Eczema,...,Stress_Low,Stress_Moderate,Poor Hair Care Habits _No,Poor Hair Care Habits _Yes,Environmental Factors_No,Environmental Factors_Yes,Smoking_No,Smoking_Yes,Weight Loss _No,Weight Loss _Yes
0,22,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [114]:
prediction = model.predict(one_hot_encoded_df)
print('Less likely to have hairfall.') if prediction[0] == 0 else print('More likely to have hairfall.')

Less likely to have hairfall.




In [92]:
# joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [95]:
a = pd.read_csv('Predict Hair Fall.csv')
b = list(a.columns)
b

['Id',
 'Genetics',
 'Hormonal Changes',
 'Medical Conditions',
 'Medications & Treatments',
 'Nutritional Deficiencies ',
 'Stress',
 'Age',
 'Poor Hair Care Habits ',
 'Environmental Factors',
 'Smoking',
 'Weight Loss ',
 'Hair Loss']

In [107]:
print(list(a['Weight Loss '].unique()))

['No', 'Yes']
