### Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Loading the dataset 

In [None]:
# Read survey.csv (path relative to the working directory) into a pandas DataFrame
csv_path = 'survey.csv'
df = pd.read_csv(csv_path)

### Knowing the data

In [None]:
rows_and_columns = df.shape  # (number of rows, number of columns)
print("Shape:", rows_and_columns)

In [None]:
df.sample(n=10)  # Display 10 randomly selected rows

In [None]:
df.info()  # Prints a concise summary: each column's name, non-null count and dtype

In [None]:
df.isna().sum()  # Per-column count of missing values

### Cleaning the dataset

#### Unnecessary columns
The columns "Timestamp", "state", and "comments" contain many null values and are of little importance for our data model.  
Thus, they are dropped from the dataset.

In [None]:
# Remove the columns that are not needed for the analysis
unused_columns = ['Timestamp', 'state', 'comments']
df = df.drop(unused_columns, axis='columns')

#### Age column
Values in the 'Age' column need to lie within a valid range, i.e., 0 to 100; rows outside this range are removed.

In [None]:
# Remove every row whose 'Age' is negative or greater than 100
age = df['Age']
invalid_age_rows = df.index[(age < 0) | (age > 100)]
df.drop(index=invalid_age_rows, inplace=True)

#### Gender column
Gender is a categorical variable, so it should ideally have only three categories: Female, Male, and Other (covering categories other than female and male).  
So, the entries in the 'Gender' column need to be mapped to the above categories.

In [None]:
# Normalise the free-text 'Gender' answers into three categories.
# Each entry is (canonical label, raw spellings that are mapped to it).
gender_groups = [
    ('Male', ['Male ', 'male', 'M', 'm', 'Male', 'Cis Male',
              'Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
              'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make']),
    ('Female', ['Female ', 'female', 'F', 'f', 'Woman', 'Female',
                'femail', 'Cis Female', 'cis-female/femme', 'Femake', 'Female (cis)',
                'woman']),
    ('Other', ['Female (trans)', 'queer/she/they', 'non-binary',
               'fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
               'Agender', 'A little about you', 'Nah', 'All',
               'ostensibly male, unsure what that really means',
               'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?',
               'Guy (-ish) ^_^', 'Trans woman']),
]

# Assign the result back to the column rather than calling
# df['Gender'].replace(..., inplace=True): the in-place call acts on a
# selected Series, which raises a FutureWarning on pandas 2.x and leaves df
# unchanged when Copy-on-Write is enabled.
for label, raw_values in gender_groups:
    df['Gender'] = df['Gender'].replace(raw_values, label)

# Display the resulting category counts
df['Gender'].value_counts()

### Handling missing values

In [None]:
# Filling the missing values.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df[...] acts on a selected Series, which
# raises a FutureWarning on pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
df['self_employed'] = df['self_employed'].fillna('No')

# NOTE(review): 'N/A' is one of the strings pd.read_csv treats as missing by
# default, so these values become NaN again if the saved CSV is re-read
# without keep_default_na=False.
df['work_interfere'] = df['work_interfere'].fillna('N/A')

# Display the remaining null counts per column (last expression of the cell)
df.isnull().sum()


#### Checking for the uniqueness of data

In [None]:
# Report how many fully duplicated rows exist, then remove them
duplicate_count = df.duplicated().sum()
print("Duplicates:", duplicate_count)
df = df.drop_duplicates()

# List the distinct values found in every column
for column in df.columns:
    distinct_values = df[column].unique()
    print(f"{column} : {distinct_values}")

#### Saving the cleaned dataset 

In [134]:
# df_cleaned is a second name for the same DataFrame object as df (no copy is made)
df_cleaned = df
# Write the cleaned data to disk without the row index
df_cleaned.to_csv(path_or_buf="cleaned_data.csv", index=False)

### Visualizing the data

#### Count plots and histogram

In [None]:
# Age distribution: histogram with 25 bins and a KDE curve overlaid

sns.histplot(df['Age'], kde=True, bins=25)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
# Gender counts split by 'treatment'; bars follow the value_counts() order

gender_order = df['Gender'].value_counts().index
ax = sns.countplot(data=df, x='Gender', hue='treatment', order=gender_order)
ax.set_title('Gender Distribution')
ax.set_ylabel('Count')
ax.set_xlabel('Gender')
plt.show()

In [None]:
# Number of respondents per 'treatment' answer
ax = sns.countplot(data=df, x='treatment')
ax.set_title('Distribution of Seeking Treatment')
ax.set_xlabel('Sought Treatment for Mental Health')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Self-employment counts split by 'treatment'; bars follow the value_counts() order
self_employed_order = df['self_employed'].value_counts().index
ax = sns.countplot(data=df, x='self_employed', hue='treatment', order=self_employed_order)
ax.set_title('Self Employment Distribution')
ax.set_xlabel('Self Employed')
ax.set_ylabel('count')
plt.show()

In [None]:
# Family-history counts split by 'treatment'; bars follow the value_counts() order
family_history_order = df['family_history'].value_counts().index
ax = sns.countplot(data=df, x='family_history', hue='treatment', order=family_history_order)
ax.set_title('Family History Distribution')
ax.set_xlabel('Family History')
ax.set_ylabel('count')
plt.show()


In [None]:
# Work-interference counts split by 'treatment'; bars follow the value_counts() order
work_interfere_order = df['work_interfere'].value_counts().index
ax = sns.countplot(data=df, x='work_interfere', hue='treatment', order=work_interfere_order)
ax.set_title('Work Interference Distribution')
ax.set_xlabel('Work Interference')
ax.set_ylabel('count')
plt.show()


In [None]:
# Number-of-employees counts split by 'treatment'; bars follow the value_counts() order
no_employees_order = df['no_employees'].value_counts().index
ax = sns.countplot(data=df, x='no_employees', hue='treatment', order=no_employees_order)
ax.set_title('Number of Employees Distribution')
ax.set_xlabel('Number of Employees')
ax.set_ylabel('count')
plt.show()


In [None]:
# Remote-work counts split by 'treatment'; bars follow the value_counts() order
remote_work_order = df['remote_work'].value_counts().index
ax = sns.countplot(data=df, x='remote_work', hue='treatment', order=remote_work_order)
ax.set_title('Remote Work Distribution')
ax.set_xlabel('remote_work')
ax.set_ylabel('count')
plt.show()


In [None]:
# Tech-company counts split by 'treatment'; bars follow the value_counts() order
tech_company_order = df['tech_company'].value_counts().index
ax = sns.countplot(data=df, x='tech_company', hue='treatment', order=tech_company_order)
ax.set_title('Tech Company Distribution')
ax.set_xlabel('Tech Company')
ax.set_ylabel('count')
plt.show()


In [None]:
# 'benefits' answers on the x-axis, one bar per 'treatment' value
ax = sns.countplot(data=df, x='benefits', hue='treatment')
ax.set_title('Effect of Company Benefits on Seeking Treatment')
ax.set_xlabel('Does the company provide mental health benefits?')
ax.set_ylabel('Count')
ax.legend(title='Sought Treatment')
plt.show()


In [None]:
# 'wellness_program' answers on the x-axis, one bar per 'treatment' value
ax = sns.countplot(data=df, x='wellness_program', hue='treatment')
ax.set_title('Effect of Wellness Programs on Seeking Treatment')
ax.set_xlabel('Has the company discussed mental health as part of a wellness program?')
ax.set_ylabel('Count')
ax.legend(title='Sought Treatment')
plt.show()

In [None]:
# 'leave' answers on the x-axis in a fixed easy-to-difficult order,
# one bar per 'treatment' value
leave_order = ['Very easy', 'Somewhat easy', "Don't know", 'Somewhat difficult', 'Very difficult']
ax = sns.countplot(data=df, x='leave', hue='treatment', order=leave_order)
ax.set_title('Effect of Leave Policy on Seeking Treatment')
ax.set_xlabel('How easy is it to take medical leave for a mental health condition?')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
ax.legend(title='Sought Treatment')
plt.show()

### Encoding Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode a separate deep copy so that df itself is left unmodified
df_encoded = df.copy(deep=True)

In [None]:
# Replace every object-dtype column with integer codes,
# fitting a fresh LabelEncoder for each column
categorical_columns = df_encoded.select_dtypes(include='object').columns
for column_name in categorical_columns:
    encoder = LabelEncoder()
    df_encoded[column_name] = encoder.fit_transform(df_encoded[column_name])

In [None]:
# Pairwise Pearson correlation between the columns of the encoded frame
corr_matrix = df_encoded.corr(method='pearson')

In [None]:
# Draw the correlation matrix as a colour-coded grid without cell annotations
ax = sns.heatmap(data=corr_matrix, annot=False, cmap='coolwarm')
ax.set_title('Correlation Matrix of All Features')
plt.show()

In [None]:
# Focusing on correlations with "treatment", sorted from highest to lowest
corr_treatment = corr_matrix['treatment'].sort_values(ascending=False)
print("Top correlations with seeking treatment:")
# Leave out the 'treatment' row itself: a column's correlation with itself is
# always 1.0 and would otherwise take the first of the five printed slots.
print(corr_treatment.drop(labels='treatment').head(5))