In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the file path
file_path = "C:/Users/Admin/Favorites/Documents/TECHNICAL PROGRAMMING II/settle2.csv"

# Load the dataset from a CSV file
data = pd.read_csv(file_path, skiprows=4)  # Adjust skiprows if the initial rows contain metadata

# Understanding the dataset
print("Dataset Size: ", data.shape)
print("\nColumn Names: ", data.columns)
print("\nData Types:\n", data.dtypes)
print("\nFirst 5 Rows of the Dataset:\n", data.head())

# Identifying the missing data
missingData = data.isnull().sum()
print("\nMissing Data Count:\n", missingData)

# Normalize a specific column ('Appel Boord')
column_name = 'Appel Boord'
if column_name in data.columns:
    data[column_name] = pd.to_numeric(data[column_name], errors='coerce')
    
    # Handle missing values by filling with the mean
    mean_value = data[column_name].mean()
    data[column_name] = data[column_name].fillna(mean_value)
    
    # Ensure there are no NaN values left
    if data[column_name].isnull().sum() == 0:
        print(f"\nNo NaN values in '{column_name}' after filling.")
    else:
        print(f"\nNaN values still present in '{column_name}' after filling.")
    
    # Check if normalization is needed
    print("\nSummary Statistics Before Normalization:\n", data[column_name].describe())
    
    # Normalize the data using StandardScaler
    scaler = StandardScaler()
    data['temp_sanitation'] = scaler.fit_transform(data[[column_name]])
    
    print("\nSummary Statistics After Normalization:\n", data['temp_sanitation'].describe())
else:
    print(f"\nColumn '{column_name}' not found.")

# Data Slicing and Indexing
if 'temp_sanitation' in data.columns and 'age_of_is' in data.columns:
    country_data = data[(data['City land. Can be upgraded'] == 'India') & (data['Year'] >= 2000) & (data['Year'] <= 2020)]
    print("\nData for India from 2000 to 2020:\n", country_data)
else:
    print("\nColumns 'City land. Can be upgraded' or 'Year' not found.")

# Using columns specified in the dataset for operations
specified_columns = [
    "is_cluster", "pocket_name", "pocket_alias", "ward_id", "ward_councillor", "avg_hhsize", "area_ha", 
    "density_dwellings", "structure_count", "age_of_is", "upgrade_category", "temp_sanitation", "toilets_full_flush", 
    "toilets_chemical", "toilets_portable", "toilets_container", "toilets_other", "toilets_total", 
    "ratio_toilets_dwellings", "pocket_code", "centroidx", "centroidy"
]

# Print specified columns
if all(col in data.columns for col in specified_columns):
    print("\nSelected Columns:\n", data[specified_columns].head())
else:
    missing_cols = [col for col in specified_columns if col not in data.columns]
    print(f"\nMissing Columns: {missing_cols}")

# Example operation using specified columns (e.g., analyzing average household size)
if 'avg_hhsize' in data.columns:
    avg_hhsize_mean = data['avg_hhsize'].mean()
    print(f"\nAverage Household Size: {avg_hhsize_mean}")
else:
    print("\nColumn 'avg_hhsize' not found.")


Dataset Size:  (430, 22)

Column Names:  Index(['Appel Boord', 'Appel Boord.1', 'None', '19100022', 'Johanna Martlow',
       '4.140000000', '0.088000000', '159.090909091', '14', '5 - 10 years',
       'City land. Can be upgraded', '0E-9', '10.000000000', '0E-9.1',
       '0E-9.2', '0E-9.3', '0E-9.4', '10.000000000.1', '1:1', 'TYGAPP001',
       '18.609142000', '-33.926268000'],
      dtype='object')

Data Types:
 Appel Boord                    object
Appel Boord.1                  object
None                           object
19100022                        int64
Johanna Martlow                object
4.140000000                   float64
0.088000000                   float64
159.090909091                 float64
14                              int64
5 - 10 years                   object
City land. Can be upgraded     object
0E-9                          float64
10.000000000                  float64
0E-9.1                        float64
0E-9.2                        float64
0E-9.3      

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


TypeError: Could not convert string '10 - 15 years5 - 10 years5 - 10 years10 - 15 years> 20 years10 - 15 years10 - 15 years15 - 20 years5 - 10 years15 - 20 years15 - 20 years> 20 years> 20 years> 20 years15 - 20 years15 - 20 years> 20 years> 20 years15 - 20 years0 - 5 years0 - 5 years> 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years5 - 10 years15 - 20 years5 - 10 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years15 - 20 years15 - 20 years> 20 years> 20 years15 - 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years15 - 20 years15 - 20 years15 - 20 years5 - 10 years10 - 15 years0 - 5 years15 - 20 years10 - 15 years0 - 5 years0 - 5 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years0 - 5 years15 - 20 years> 20 years10 - 15 years10 - 15 years10 - 15 years10 - 15 years15 - 20 years> 20 years5 - 10 years5 - 10 years5 - 10 years15 - 20 years10 - 15 years15 - 20 years5 - 10 years5 - 10 years5 - 10 years15 - 20 years5 - 10 years5 - 10 years> 20 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years10 - 15 years15 - 20 years5 - 10 years> 20 years15 - 20 years5 - 10 years10 - 15 years15 - 20 years10 - 15 years15 - 20 years0 - 5 years5 - 10 years15 - 20 years> 20 years> 20 years> 20 years5 - 10 years10 - 15 years15 - 20 years15 - 20 years10 - 15 years10 - 15 years15 - 20 years10 - 15 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years> 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years5 - 10 years15 - 20 years> 20 years10 - 15 years15 - 20 years10 - 15 years10 - 15 years10 - 15 years15 - 20 years5 - 10 years15 - 20 years> 20 years> 20 years5 - 10 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years> 20 years15 - 20 years15 - 20 years5 - 10 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years> 20 years15 - 20 years10 - 15 years> 20 years10 - 15 years15 - 20 years10 - 15 years15 - 20 years> 20 years15 - 20 years15 - 20 years> 20 years> 20 years> 20 years> 20 years15 - 20 years15 - 20 years10 - 15 years5 - 10 years0 - 5 years15 - 20 years5 - 10 years15 - 20 years10 - 15 years10 - 15 years5 - 10 years15 - 20 years15 - 20 years5 - 10 years10 - 15 years15 - 20 years10 - 15 years10 - 15 years15 - 20 years5 - 10 years5 - 10 years5 - 10 years15 - 20 years5 - 10 years5 - 10 years0 - 5 years5 - 10 years5 - 10 years5 - 10 years5 - 10 years5 - 10 years5 - 10 years5 - 10 years> 20 years5 - 10 years> 20 years5 - 10 years5 - 10 years10 - 15 years10 - 15 years5 - 10 years10 - 15 years15 - 20 years10 - 15 years10 - 15 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years0 - 5 years10 - 15 years> 20 years15 - 20 years15 - 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years15 - 20 years15 - 20 years5 - 10 years> 20 years15 - 20 years15 - 20 years> 20 years> 20 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years5 - 10 years10 - 15 years> 20 years5 - 10 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years10 - 15 years> 20 years10 - 15 years5 - 10 years10 - 15 years> 20 years15 - 20 years15 - 20 years15 - 20 years10 - 15 years0 - 5 years0 - 5 years15 - 20 years15 - 20 years15 - 20 years> 20 years10 - 15 years10 - 15 years10 - 15 years10 - 15 years10 - 15 years10 - 15 years15 - 20 years10 - 15 years10 - 15 years15 - 20 years10 - 15 years15 - 20 years10 - 15 years5 - 10 years10 - 15 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years0 - 5 years> 20 years0 - 5 years15 - 20 years10 - 15 years> 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years> 20 years> 20 years10 - 15 years> 20 years> 20 years5 - 10 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years5 - 10 years15 - 20 years5 - 10 years> 20 years10 - 15 years10 - 15 years> 20 years15 - 20 years5 - 10 years0 - 5 years> 20 years15 - 20 years> 20 years15 - 20 years15 - 20 years10 - 15 years15 - 20 years15 - 20 years5 - 10 years> 20 years> 20 years5 - 10 years10 - 15 years> 20 years> 20 years> 20 years> 20 years15 - 20 years> 20 years> 20 years> 20 years> 20 years> 20 years> 20 years5 - 10 years15 - 20 years> 20 years> 20 years5 - 10 years15 - 20 years> 20 years> 20 years10 - 15 years5 - 10 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years> 20 years0 - 5 years5 - 10 years15 - 20 years5 - 10 years10 - 15 years15 - 20 years15 - 20 years15 - 20 years15 - 20 years> 20 years15 - 20 years5 - 10 years10 - 15 years' to numeric

markdownContect = """
# Data Understanding and Summary

## Dataset Overview 
- Number of Rows: {}
- Number of Colomns: {}

## Colomn Names
-{}

## Data Types
-{}

## Missing Data
-{}

## Statistics for 'Urban population living in slums (% of urban population)' colomn
-{}

## Statistics for Normalized Urban Population in Slums' column
- {}

## Country data
{}
