#Identify the output variable.
In the provided context, the output variable is likely to be the "poverty_level." The goal of the project is to use the Proxy Means Test (PMT) method to predict and classify households based on their poverty level. Therefore, "poverty_level" is the variable that the model aims to identify or predict.

In [3]:
#Understand the type of data.
import pandas as pd 
# Load your dataset (replace 'your_dataset.csv' with the actual file path)
df = pd.read_csv('/Users/yashaswipatki/Downloads/Dataset for the project/test.csv')

# Display information about the data types of each column
data_types = df.dtypes
print("Data Types:\n", data_types)

# Display basic statistics for numerical columns
numerical_stats = df.describe()
print("Numerical Statistics:\n", numerical_stats)

# Display unique values and their counts for categorical columns
categorical_info = {}
for column in df.select_dtypes(include='object').columns:
    categorical_info[column] = df[column].value_counts()

print("Categorical Information:\n", categorical_info)


Data Types:
 Id                  object
v2a1               float64
hacdor               int64
rooms                int64
hacapo               int64
                    ...   
SQBhogar_nin         int64
SQBovercrowding    float64
SQBdependency      float64
SQBmeaned          float64
agesq                int64
Length: 142, dtype: object
Numerical Statistics:
                v2a1        hacdor         rooms        hacapo          v14a  \
count  6.453000e+03  23856.000000  23856.000000  23856.000000  23856.000000   
mean   1.748726e+05      0.050679      4.955776      0.028421      0.992748   
std    1.567887e+05      0.219346      1.539753      0.166174      0.084850   
min    0.000000e+00      0.000000      1.000000      0.000000      0.000000   
25%    8.000000e+04      0.000000      4.000000      0.000000      1.000000   
50%    1.400000e+05      0.000000      5.000000      0.000000      1.000000   
75%    2.200000e+05      0.000000      6.000000      0.000000      1.000000   
max    2

In [4]:
#3 Check if there are any biases in your dataset.
# Check for biases in the distribution of the target variable
target_variable = 'target_column'  # Replace with the actual target variable name
if target_variable in df.columns:
    bias_check = df[target_variable].value_counts(normalize=True)
    print(f"Bias in Target Variable ({target_variable}):\n", bias_check)
else:
    print(f"The target variable '{target_variable}' does not exist in the dataset.")


The target variable 'target_column' does not exist in the dataset.


In [5]:
#4 Check whether all members of the house have the same poverty level.
# Display all column names in the dataset
print("Column Names:\n", df.columns)

# Assuming 'Id' is the column representing the unique identifier for each household
# Replace 'your_actual_poverty_level_column' with the actual column name representing the poverty level
poverty_level_column = 'Id'

# Check if the poverty level column exists in the dataset
if poverty_level_column in df.columns:
    household_poverty_variation = df.groupby('Id')[poverty_level_column].nunique()

    # Display households with varying poverty levels
    varying_poverty_levels = household_poverty_variation[household_poverty_variation > 1]

    if varying_poverty_levels.empty:
        print("All members of the house have the same poverty level.")
    else:
        print("Some households have members with different poverty levels:")
        print(varying_poverty_levels)
else:
    print(f"The poverty level column '{poverty_level_column}' does not exist in the dataset.")


Column Names:
 Index(['Id', 'v2a1', 'hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'v18q',
       'v18q1', 'r4h1',
       ...
       'age', 'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe',
       'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
       'agesq'],
      dtype='object', length=142)
All members of the house have the same poverty level.


In [6]:
#5 Check if there is a house without a family head.
if 'parentesco1' in df.columns:
    houses_without_family_head = df[df['parentesco1'] == 1]['Id']

    if houses_without_family_head.empty:
        print("All houses have a family head.")
    else:
        print("Houses without a family head:")
        print(houses_without_family_head)
else:
    print("The column 'parentesco1' does not exist in the dataset.")

Houses without a family head:
2        ID_e5442cf6a
3        ID_a8db26a79
4        ID_a62966799
6        ID_3c5f4bd51
8        ID_472fa82da
             ...     
23836    ID_265b917e8
23841    ID_19c0b1480
23843    ID_aa256c594
23847    ID_4b7feead3
23852    ID_1a7c6953b
Name: Id, Length: 7334, dtype: object


In [7]:
#6 Set the poverty level of the members and the head of the house same in a family.
if 'parentesco1' in df.columns and 'v2a1' in df.columns:
    # Create a dictionary mapping household IDs to the poverty level of the head
    head_poverty_mapping = df[df['parentesco1'] == 1][['Id', 'v2a1']].set_index('Id')['v2a1'].to_dict()

    # Update the poverty level of the members based on the head's poverty level
    df['v2a1'] = df['Id'].map(head_poverty_mapping)

    print("Poverty levels updated successfully.")
else:
    print("One or more required columns do not exist in the dataset.")

Poverty levels updated successfully.


In [8]:
df.columns

Index(['Id', 'v2a1', 'hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'v18q',
       'v18q1', 'r4h1',
       ...
       'age', 'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe',
       'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
       'agesq'],
      dtype='object', length=142)

In [9]:
#7 Count how many null values are existing in columns.
null_values_count = df.isnull().sum()
null_values_count

Id                     0
v2a1               21922
hacdor                 0
rooms                  0
hacapo                 0
                   ...  
SQBhogar_nin           0
SQBovercrowding        0
SQBdependency          0
SQBmeaned             31
agesq                  0
Length: 142, dtype: int64

In [10]:
#8 Remove null value rows of the target variable.
# Check if the target variable 'poverty_level' exists in the dataset
if 'v2a1' in df.columns:
    # Remove rows with null values in the target variable
    df = df.dropna(subset=['v2a1'])

    print("Null value rows in the target variable removed successfully.")
else:
    print("The target variable 'poverty_level' does not exist in the dataset.")

Null value rows in the target variable removed successfully.


In [12]:
#9 Predict the accuracy using random forest classifier.
from sklearn.impute import SimpleImputer

# Check if the target variable 'poverty_level' exists in the dataset
if 'v2a1' in df.columns:
    # Assuming 'poverty_level' is the target variable and other columns are features
    X = df.dropna(subset=['v2a1'])
    y = df['v2a1']

    # Simple one-hot encoding for categorical columns
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')  # You can use other strategies like 'median' or 'most_frequent'
    X_imputed = imputer.fit_transform(X_encoded)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    # Create a Random Forest Classifier
    rf_classifier = RandomForestClassifier()

    # Fit the model on the training data
    rf_classifier.fit(X_train, y_train)

    # Predict the target variable on the testing data
    predictions = rf_classifier.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)

    print("Accuracy using Random Forest Classifier:", accuracy)
else:
    print("The target variable 'v2a1' does not exist in the dataset.")




Accuracy using Random Forest Classifier: 0.12919896640826872


In [14]:
#10 Check the accuracy using a random forest with cross-validation.

from sklearn.model_selection import cross_val_score


# Check if the target variable 'poverty_level' exists in the dataset
if 'v2a1' in df.columns:
    # Assuming 'poverty_level' is the target variable and other columns are features
    X = df.dropna(subset=['v2a1'])
    y = df['v2a1']

    # Simple one-hot encoding for categorical columns
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')  # You can use other strategies like 'median' or 'most_frequent'
    X_imputed = imputer.fit_transform(X_encoded)

    # Create a Random Forest Classifier
    rf_classifier = RandomForestClassifier()

    # Use cross_val_score for cross-validation
    cross_val_scores = cross_val_score(rf_classifier, X_imputed, y, cv=5)  # Adjust cv as needed

    # Print cross-validation scores
    print("Cross-validation scores:", cross_val_scores)

    # Calculate and print mean accuracy
    mean_accuracy = cross_val_scores.mean()
    print("Mean Accuracy using Random Forest Classifier with Cross-Validation:", mean_accuracy)
else:
    print("The target variable 'v2a1' does not exist in the dataset.")




Cross-validation scores: [0.15245478 0.16795866 0.13953488 0.12403101 0.12694301]
Mean Accuracy using Random Forest Classifier with Cross-Validation: 0.14218446666934437
