In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pymongo import MongoClient

from user import User

In [None]:
# Open a client on the MongoDB server at 127.0.0.1:27017 and take handles to
# the 'survey_database' database and its 'user_data' collection.
client = MongoClient('mongodb://127.0.0.1:27017/')
db = client.get_database('survey_database')
collection = db.get_collection('user_data')

In [None]:
def _build_user(document):
    """Create a User from one document returned by ``collection.find()``.

    The insurance fields are read from the nested ``health_insurance``
    sub-document; every other field is read from the top level. A missing
    key raises ``KeyError``.
    """
    return User(
        name=document['name'],
        email=document['email'],
        age=document['age'],
        gender=document['gender'],
        marital_status=document['marital_status'],
        education=document['education'],
        employment_status=document['employment_status'],
        occupation=document['occupation'],
        income=document['income'],
        expenses=document['expenses'],
        has_health_insurance=document['health_insurance']['has_insurance'],
        insurance_type=document['health_insurance']['insurance_type'],
    )


# Turn every stored document into a User object
users = [_build_user(document) for document in collection.find()]

# Each User is serialised through its own to_dict() method
user_dicts = [survey_user.to_dict() for survey_user in users]

# Build the DataFrame once from all records, then write it to CSV
df = pd.DataFrame(user_dicts)
df.to_csv('survey_data.csv', index=False)
print("Data saved to survey_data.csv")

In [None]:
# Expense columns of the survey DataFrame used by the plots below
expense_categories = [
    'utilities',
    'shopping',
    'healthcare',
    'entertainment',
    'school_fees',
]

# Read the survey back from the CSV export
df = pd.read_csv('survey_data.csv')
print("Data loaded from survey_data.csv")

In [None]:
# Folder that receives every PNG written by the plots below. `os` comes from
# the notebook's import cell; exist_ok=True makes re-running this cell safe
# when the folder already exists.
images_folder = 'visualizations'
os.makedirs(images_folder, exist_ok=True)

# 1. Income Distribution by Age
# Scatter of income against age, saved to the images folder and displayed.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df['age'], df['income'])
ax.set_title('Income Distribution by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Income')
fig.savefig(os.path.join(images_folder, 'income_by_age.png'))
plt.show()

# 2. Gender distribution across spending categories
# Grouped bar chart: one pair of bars (male / female) per expense category.
# Each gender gets one fixed colour so the legend matches every bar; calling
# bar() once per bar without a colour would advance the colour cycle each
# time and leave most bars in colours the legend does not explain.
# NOTE(review): assumes the 'gender' column holds the lowercase values
# 'male' and 'female' - confirm against the survey data.
fig, ax = plt.subplots(figsize=(12, 6))

# Mean spend per category for each gender; reindex keeps both rows present
# (as NaN) even when one gender is absent from the data.
gender_means = (
    df.groupby('gender')[expense_categories]
    .mean()
    .reindex(['male', 'female'])
)

positions = np.arange(len(expense_categories))
bar_width = 0.4
ax.bar(
    positions - bar_width / 2,
    gender_means.loc['male'].to_numpy(),
    width=bar_width,
    color='tab:blue',
    label='Male',
)
ax.bar(
    positions + bar_width / 2,
    gender_means.loc['female'].to_numpy(),
    width=bar_width,
    color='tab:orange',
    label='Female',
)

ax.set_xticks(positions)
ax.set_xticklabels(expense_categories, rotation=45)
ax.set_title('Average Spending by Gender and Category')
ax.set_xlabel('Expense Category')
ax.set_ylabel('Average Spending')
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(images_folder, 'spending_by_gender.png'))
plt.show()

# 3. Income Distribution by Education Level
# One income box per education level; x labels are tilted 45 degrees.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='education', y='income', ax=ax)
ax.set(
    title='Income Distribution by Education Level',
    xlabel='Education Level',
    ylabel='Income',
)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig(os.path.join(images_folder, 'income_by_education.png'))
plt.show()

# 4. Employment Status and Income
# One income box per employment status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='employment_status', y='income', ax=ax)
ax.set(
    title='Income Distribution by Employment Status',
    xlabel='Employment Status',
    ylabel='Income',
)
fig.savefig(os.path.join(images_folder, 'income_by_employment_status.png'))
plt.show()

# 5. Marital Status and Expenses
# Stacked bars of the mean spend per expense category for each marital status.
# DataFrame.plot opens a new figure unless it is given an Axes, so the sized
# figure is created first and passed in; otherwise an empty 12x6 figure is
# left behind and the chart is drawn at the default size.
fig, ax = plt.subplots(figsize=(12, 6))
expenses_by_marital_status = df.groupby('marital_status')[expense_categories].mean()
expenses_by_marital_status.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Average Expenses by Marital Status')
ax.set_xlabel('Marital Status')
ax.set_ylabel('Average Expenses')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Expense Category', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(os.path.join(images_folder, 'expenses_by_marital_status.png'))
plt.show()

# 6. Health Insurance Status and Healthcare Expenses
# One healthcare-expense box per value of the has_health_insurance column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='has_health_insurance', y='healthcare', ax=ax)
ax.set(
    title='Healthcare Expenses by Insurance Status',
    xlabel='Has Health Insurance',
    ylabel='Healthcare Expenses',
)
fig.savefig(os.path.join(images_folder, 'healthcare_expenses_by_insurance.png'))
plt.show()

# 7. Occupation and Income
# Mean income per occupation, highest first, drawn as a bar chart.
occupation_income = (
    df.groupby('occupation')['income']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 6))
occupation_income.plot(kind='bar', ax=ax)
ax.set(
    title='Average Income by Occupation',
    xlabel='Occupation',
    ylabel='Average Income',
)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig(os.path.join(images_folder, 'income_by_occupation.png'))
plt.show()

# 8. Insurance Type Distribution
# Pie chart of how often each insurance_type value occurs, with each slice
# labelled by its percentage.
fig, ax = plt.subplots(figsize=(10, 6))
insurance_counts = df['insurance_type'].value_counts()
insurance_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Distribution of Insurance Types')
# Blank out the y label on the pie.
ax.set_ylabel('')
fig.savefig(os.path.join(images_folder, 'insurance_type_distribution.png'))
plt.show()

# 9. Correlation Matrix
# Pairwise correlations between age, income and every expense category,
# drawn as an annotated heatmap on a fixed [-1, 1] colour scale centred at 0.
numeric_columns = ['age', 'income', *expense_categories]
correlation_matrix = df.loc[:, numeric_columns].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Variables')
fig.tight_layout()
fig.savefig(os.path.join(images_folder, 'correlation_matrix.png'))
plt.show()

# 10. Age Distribution by Employment Status
# One age box per employment status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='employment_status', y='age', ax=ax)
ax.set(
    title='Age Distribution by Employment Status',
    xlabel='Employment Status',
    ylabel='Age',
)
fig.savefig(os.path.join(images_folder, 'age_by_employment_status.png'))
plt.show()

# 11. Education Level and Health Insurance Status
# Row-normalised cross-tabulation: for each education level, the share of
# respondents per has_health_insurance value, drawn as stacked bars.
education_insurance = pd.crosstab(
    index=df['education'],
    columns=df['has_health_insurance'],
    normalize='index',
)
# DataFrame.plot creates the figure here and returns its Axes.
ax = education_insurance.plot(kind='bar', stacked=True)
ax.set_title('Health Insurance Status by Education Level')
ax.set_xlabel('Education Level')
ax.set_ylabel('Proportion')
ax.legend(title='Has Health Insurance')
ax.figure.tight_layout()
ax.figure.savefig(os.path.join(images_folder, 'insurance_by_education.png'))
plt.show()

# Report the folder the PNGs were actually written to, not a hard-coded name.
print(f"All visualizations saved as PNG files in the '{images_folder}' folder.")


In [None]:
# Summary statistics
# describe() output is written to CSV as-is.
summary_stats = df.describe()
summary_stats.to_csv('summary_statistics.csv')
print("Summary statistics saved to 'summary_statistics.csv'")

# Additional insights
# Each figure is computed immediately before it is printed.
print("\nAdditional Insights:")

average_income = df['income'].mean()
print(f"Average income: ${average_income:.2f}")

median_income = df['income'].median()
print(f"Median income: ${median_income:.2f}")

# mode() can return several values on ties; the first one is reported.
top_occupation = df['occupation'].mode().values[0]
print(f"Most common occupation: {top_occupation}")

# NOTE(review): assumes has_health_insurance holds the literal string 'yes'
# for insured respondents - confirm against what User.to_dict() emits.
insured_pct = df['has_health_insurance'].eq('yes').mean() * 100
print(f"Percentage with health insurance: {insured_pct:.1f}%")

income_by_education = df.groupby('education')['income'].mean()
print(f"Education level with highest average income: {income_by_education.idxmax()}")

income_by_employment = df.groupby('employment_status')['income'].mean()
print(f"Employment status with highest average income: {income_by_employment.idxmax()}")