# In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats

# NOTE: machine-specific absolute path; adjust it when running elsewhere.
file_path = r"C:\Users\ttder\OneDrive\Desktop\Data\Ricardian.dta"

# Print the file path to ensure it is being interpreted correctly
print(f"Checking the existence of: {file_path}")

# Fail fast when the file is missing: carrying on would only surface later
# as a confusing NameError on `df`.
path = Path(file_path)
if not path.is_file():
    raise FileNotFoundError(f"{file_path} does not exist.")
print(f"{file_path} file exists.")

# Load the Stata file into a pandas DataFrame
df = pd.read_stata(file_path)

# Select the columns used in the analysis. Every name appears exactly once:
# a repeated label makes `df[col]` return a DataFrame instead of a Series,
# which breaks per-column statistics. `.copy()` detaches the selection from
# the loaded frame so the cleaning steps below act on an independent object
# and do not raise SettingWithCopyWarning.
selected_columns = [
    'springtemperature',
    'summertemperature',
    'falltemperature',
    'summerprecipitation',
    'fallprecipitation',
    'distancetomarketkm',
    'livestockowner',
    'education',
    'hhsize',
    'netrevenue',
]
df = df[selected_columns].copy()

# Inspect the first few rows of the DataFrame
print(df.head())

# Check null values and data types (info() prints by itself)
df.info()

# Statistical summary; printed explicitly because a bare expression in the
# middle of a script/cell is evaluated and then discarded.
print(df.describe())

# Count missing values per column
print(df.isna().sum())

# Fill missing values with the column median. `numeric_only=True` keeps this
# working when a column is non-numeric (e.g. a Stata labelled variable read
# as categorical); such columns are left unfilled.
df = df.fillna(df.median(numeric_only=True))

# Remove duplicate rows
df = df.drop_duplicates()

# Detect outliers with the IQR method: a value is flagged when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
def detect_outliers_iqr(df):
    """Flag IQR outliers cell by cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only numeric columns are analysed.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as ``df``. A cell is
        True when its value is below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR`` of its column. Non-numeric columns stay all
        False, and missing values are never flagged because comparisons
        with NaN evaluate to False.
    """
    # Start from "nothing is an outlier" so non-numeric columns are covered.
    flags = pd.DataFrame(False, index=df.index, columns=df.columns)

    numeric_names = df.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        too_low = values.lt(first_quartile - 1.5 * spread)
        too_high = values.gt(third_quartile + 1.5 * spread)
        flags[name] = too_low | too_high

    return flags

# Preferred way to correct outliers (kept for reference only: the code below is disabled by wrapping it in a string literal)
"""
 def correct_outliers(df, method='median'):
    # Detect outliers
    outliers = detect_outliers_iqr(df)
    
    for col in df.select_dtypes(include=[np.number]).columns:
        if method == 'median':
            median = df[col].median()
            df.loc[outliers[col], col] = median
        elif method == 'mean':
            mean = df[col].mean()
            df.loc[outliers[col], col] = mean
        elif method == 'cap':
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df[col] = np.clip(df[col], lower_bound, upper_bound)
        elif method == 'remove':
            df = df[~outliers[col]]
    
    return df

# Fill missing values with the median
df.fillna(df.median(), inplace=True)

# Convert all columns to integers
df = df.astype(int)

# Correct outliers using median replacement
corrected_df = correct_outliers(df, method='median')
print("Corrected DataFrame:")
print(corrected_df)
"""

# CAUTION: removing rows is generally not advisable; prefer replacing outliers with the mean or median. Outliers are removed here only because of issues specific to this dataset.

# Define function to remove outliers
def remove_outliers(df):
    """Return the rows of ``df`` that contain no IQR outlier in any column.

    Rows are dropped as soon as ``detect_outliers_iqr`` flags at least one
    of their cells; ``df`` itself is not modified.
    """
    row_has_outlier = detect_outliers_iqr(df).any(axis=1)
    keep_rows = ~row_has_outlier
    return df[keep_rows]

# --- IQR-based outlier report -------------------------------------------
# Build the per-cell boolean outlier mask and print it, followed by every
# row that has at least one flagged cell.
outliers = detect_outliers_iqr(df)
print("Outliers detected using IQR method:")
print(outliers)
print("\nDataFrame with outliers:")
print(df[outliers.any(axis=1)])

# Drop the rows flagged by the IQR method.
# NOTE(review): in the code visible here `df_no_outliers` is only printed;
# the final `df` below comes from the Z-score filter applied to the
# original `df`, not to this result -- confirm that this is intended.
df_no_outliers = remove_outliers(df)
print("\nDataFrame after removing outliers:")
print(df_no_outliers)

# --- Z-score-based alternative ------------------------------------------
# Z-score: identifies outliers by their distance from the mean, measured in
# standard deviations. Absolute Z-scores are computed for numeric columns.
numerical_columns = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(stats.zscore(df[numerical_columns]))
# A row is an outlier when any of its numeric columns has |z| > 3.
z_outliers = (z_scores > 3).any(axis=1)
print("Number of outliers detected using Z-score:", z_outliers.sum())

# Optional: display the rows flagged as outliers
print(df[z_outliers])
# Keep only the rows without any Z-score outlier and summarise the result
df_cleaned = df[~z_outliers]
print(df_cleaned.describe())

# Rebind `df` to the Z-score-cleaned data so that code using `df` after this
# point works on the filtered rows.
df = df_cleaned
