In [None]:
import pandas as pd
import numpy as np

# Pin NumPy's global random state so repeated runs draw the same values.
np.random.seed(seed=42)

# How many synthetic records to generate.
n_samples = 10_000

def determine_homeless(income, employment_status, education_level, mental_health_status, housing_history, disability, social_support):
    """Randomly label one record as homeless (True) or not (False).

    The chance starts at 0.1 and grows by a fixed amount for every risk
    factor that applies. The total is clipped to 1.0 and compared against a
    single uniform draw taken from NumPy's global random state.

    Risk factors and their increments:
        income < 10000                                    -> +0.4
        employment_status is 'Unemployed' or 'Part-time'  -> +0.3
        education_level == 'No formal education'          -> +0.2
        mental_health_status == 'Poor'                    -> +0.2
        housing_history == 'Homeless in the past'         -> +0.3
        disability == 1                                   -> +0.2
        social_support == 'Weak'                          -> +0.2

    Returns:
        True when the uniform draw is strictly below the clipped chance.
    """
    # (factor applies?, increment) pairs, listed in the order they are summed.
    risk_factors = (
        (income < 10000, 0.4),
        (employment_status in ('Unemployed', 'Part-time'), 0.3),
        (education_level == 'No formal education', 0.2),
        (mental_health_status == 'Poor', 0.2),
        (housing_history == 'Homeless in the past', 0.3),
        (disability == 1, 0.2),
        (social_support == 'Weak', 0.2),
    )

    chance = 0.1  # baseline before any risk factor is considered
    for applies, increment in risk_factors:
        if applies:
            chance += increment

    # Exactly one draw per call; the chance never exceeds certainty.
    return np.random.rand() < min(chance, 1.0)

# --- Synthetic feature table ---------------------------------------------

def _draw_labels(options):
    """Pick one entry of `options` per sample, uniformly at random."""
    return np.random.choice(options, size=n_samples)


# The columns are drawn in the order listed, so the sequence of values taken
# from the seeded global RNG is fixed.
data = {
    'age': np.random.randint(18, 80, size=n_samples),  # upper bound is exclusive
    'gender': _draw_labels(['Male', 'Female', 'Other']),
    'income_level': np.random.randint(0, 100000, size=n_samples),
    'employment_status': _draw_labels(['Employed', 'Unemployed', 'Part-time', 'Retired']),
    'education_level': _draw_labels(['No formal education', 'High School', 'Associate Degree', 'Bachelor’s Degree', 'Master’s Degree']),
    'mental_health_status': _draw_labels(['Good', 'Fair', 'Poor']),
    'substance_abuse': _draw_labels([0, 1]),  # 0 = No, 1 = Yes
    'family_status': _draw_labels(['Single', 'Married', 'Divorced', 'Widowed']),
    'housing_history': _draw_labels(['Rented', 'Owned', 'Homeless in the past', 'Temporary shelter']),
    'disability': _draw_labels([0, 1]),  # 0 = No, 1 = Yes
    'region': _draw_labels(['Urban', 'Suburban', 'Rural']),
    'social_support': _draw_labels(['Strong', 'Moderate', 'Weak']),
}

# Assemble the columns into a table.
df = pd.DataFrame(data)

# --- Target column ---------------------------------------------------------

# Columns fed to determine_homeless, in the order of its positional parameters.
_label_inputs = ['income_level', 'employment_status', 'education_level',
                 'mental_health_status', 'housing_history', 'disability',
                 'social_support']


def _label_row(row):
    """Label a single record by passing its risk-factor columns to determine_homeless."""
    return determine_homeless(*(row[name] for name in _label_inputs))


df['homeless'] = df.apply(_label_row, axis=1)

# Preview the first few records.
print(df.head())

# Persist the dataset so the later cells can reload it.
df.to_csv('custom_homelessness_data_10000.csv', index=False)


   age  gender  income_level employment_status      education_level  \
0   56  Female          6033           Retired     Associate Degree   
1   69    Male         62461           Retired  No formal education   
2   46    Male          2665          Employed          High School   
3   32   Other         27025         Part-time  No formal education   
4   60    Male         23533         Part-time      Master’s Degree   

  mental_health_status  substance_abuse family_status       housing_history  \
0                 Good                0        Single                 Owned   
1                 Poor                0      Divorced                 Owned   
2                 Fair                1       Widowed     Temporary shelter   
3                 Good                1       Married                Rented   
4                 Poor                1        Single  Homeless in the past   

   disability    region social_support  homeless  
0           0     Rural       Moderate      Tru

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the generated dataset back from disk (point this at your own file if needed).
df = pd.read_csv('custom_homelessness_data_10000.csv')

# Text-valued columns that must become integer codes before training.
categorical_cols = ['gender', 'employment_status', 'education_level', 'mental_health_status',
                    'family_status', 'housing_history', 'region', 'social_support']

# A single encoder is refit for each column in turn; after this statement it
# holds only the mapping of the last column in the list.
le = LabelEncoder()
df = df.assign(**{col: le.fit_transform(df[col]) for col in categorical_cols})

# Separate the predictors from the label.
X, y = df.drop(columns='homeless'), df['homeless']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit a 100-tree random forest (fit returns the estimator itself).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)

# Report the evaluation metrics.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nAccuracy Score:", accuracy_score(y_test, y_pred), sep="\n")

# Per-feature importance as estimated by the fitted forest.
feature_importances = rf_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for name, weight in zip(features, feature_importances):
    print(f"{name}: {weight}")


Confusion Matrix:
[[ 438  338]
 [ 220 1004]]

Classification Report:
              precision    recall  f1-score   support

       False       0.67      0.56      0.61       776
        True       0.75      0.82      0.78      1224

    accuracy                           0.72      2000
   macro avg       0.71      0.69      0.70      2000
weighted avg       0.72      0.72      0.72      2000


Accuracy Score:
0.721

Feature Importances:
age: 0.16622018296135432
gender: 0.049726190846999516
income_level: 0.22093914391996683
employment_status: 0.0951399170850461
education_level: 0.08225885417702838
mental_health_status: 0.05766911181259801
substance_abuse: 0.03028779707347177
family_status: 0.06461090645453578
housing_history: 0.090230580969425
disability: 0.036603592707227114
region: 0.05075564294939438
social_support: 0.05555807904295301


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the synthetic dataset
df = pd.read_csv('custom_homelessness_data_10000.csv')

# Columns that need encoding
categorical_cols = ['gender', 'employment_status', 'education_level', 'mental_health_status',
                    'family_status', 'housing_history', 'region', 'social_support']

# Fit ONE LabelEncoder per categorical column and keep it around.
# A single shared encoder would be overwritten on every column, and the
# label -> integer mapping learned here is needed again at prediction time.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Split the data into features (X) and target (y)
X = df.drop(columns='homeless')  # Features
y = df['homeless']  # Target

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


def _encode_answer(column, answer):
    """Translate a typed answer into the integer code used during training.

    Args:
        column: Name of the categorical column the answer belongs to.
        answer: Raw text typed by the user; surrounding whitespace is ignored.

    Returns:
        The integer code the fitted encoder for `column` assigns to the label.

    Raises:
        ValueError: If the answer is not one of the labels seen in training.
    """
    encoder = encoders[column]
    label = answer.strip()
    known_labels = list(encoder.classes_)
    if label not in known_labels:
        allowed = ', '.join(str(item) for item in known_labels)
        raise ValueError(f"Unknown value {label!r} for {column}; expected one of: {allowed}")
    # transform (not fit_transform): reuse the training mapping instead of
    # refitting on a single value, which would always yield code 0.
    return encoder.transform([label])[0]


# Function to take input from the user
def user_input():
    """Prompt for every model feature and return them as a one-row DataFrame.

    Categorical answers are converted with the encoders fitted on the
    training data, so the codes match what the model was trained on.

    Returns:
        A single-row DataFrame whose columns follow the order of the
        training features.

    Raises:
        ValueError: If a numeric answer is not an integer, or a categorical
            answer is not one of the labels present in the training data.
    """
    # Collect user input for each feature
    age = int(input("Enter your age: "))
    gender = input("Enter your gender (Male/Female/Other): ")
    income_level = int(input("Enter your income level: "))
    employment_status = input("Enter your employment status (Employed/Unemployed/Part-time/Retired): ")
    education_level = input("Enter your education level (No formal education/High School/Associate Degree/Bachelor’s Degree/Master’s Degree): ")
    mental_health_status = input("Enter your mental health status (Good/Fair/Poor): ")
    substance_abuse = int(input("Have you ever had substance abuse? (1 for Yes, 0 for No): "))
    family_status = input("Enter your family status (Single/Married/Divorced/Widowed): ")
    housing_history = input("Enter your housing history (Rented/Owned/Homeless in the past/Temporary shelter): ")
    disability = int(input("Do you have a disability? (1 for Yes, 0 for No): "))
    region = input("Enter your region (Urban/Suburban/Rural): ")
    social_support = input("How strong is your social support? (Strong/Moderate/Weak): ")

    # Encode the categorical data using the encoders that were fitted in training
    encoded_data = {
        'age': [age],
        'gender': [_encode_answer('gender', gender)],
        'income_level': [income_level],
        'employment_status': [_encode_answer('employment_status', employment_status)],
        'education_level': [_encode_answer('education_level', education_level)],
        'mental_health_status': [_encode_answer('mental_health_status', mental_health_status)],
        'substance_abuse': [substance_abuse],
        'family_status': [_encode_answer('family_status', family_status)],
        'housing_history': [_encode_answer('housing_history', housing_history)],
        'disability': [disability],
        'region': [_encode_answer('region', region)],
        'social_support': [_encode_answer('social_support', social_support)]
    }

    # Convert the input to a DataFrame, with columns in the training order
    user_df = pd.DataFrame(encoded_data)[list(X.columns)]

    return user_df

# Get input from the user
user_data = user_input()

# Predict if the person is homeless
prediction = rf_model.predict(user_data)

# Output the result to the user
if prediction[0] == 1:
    print("According to the model, the person is homeless.")
else:
    print("According to the model, the person is not homeless.")


KeyboardInterrupt: Interrupted by user

In [None]:
import joblib

# Serialize the fitted classifier to disk; only the model object is written.
model_path = 'homeless_model.pkl'
joblib.dump(rf_model, model_path)
 

['homeless_model.pkl']