## Outlier Analysis

In [None]:
#importing the basic libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the workbook (relative path, resolved against the kernel's working
# directory) into a DataFrame and show which columns it provides.
rich_df = pd.read_excel(io="richpeople.xlsx")
rich_df.columns

In [None]:
# (n_rows, n_columns) of rich_df at this point in the notebook.
rich_df.shape

In [None]:
# Narrow the frame to rows whose 'year' is 2014, then preview the first two rows.
rich_df = rich_df.loc[rich_df['year'] == 2014]
rich_df.iloc[:2]

In [None]:
# Count of missing values in each column.
rich_df.isna().sum()

In [None]:
# Keep only rows where age, net worth and founding date are all present, then summarise.
# .copy() makes the cleaned frame independent of the frame it was filtered from, so the
# derived columns assigned in later cells do not raise SettingWithCopyWarning.
rich_df = rich_df.dropna(subset=['age', 'networthusbillion', 'foundingdate']).copy()
rich_df.describe()

In [None]:
# Box plot of net worth; return_type='axes' asks pandas to return the matplotlib Axes it drew on.
rich_df.boxplot(column='networthusbillion', return_type='axes')

In [None]:
# Box plot of age; return_type='axes' asks pandas to return the matplotlib Axes it drew on.
rich_df.boxplot(column='age', return_type='axes')

In [None]:
# Number of rows per citizenship value (value_counts sorts by count, largest first).
rich_df['citizenship'].value_counts()

In [None]:
# The five rows with the highest net worth.
rich_df.sort_values(by='networthusbillion', ascending=False).head(5)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for age.
rich_df['age'].describe()

In [None]:
# The three rows with the highest age.
rich_df.sort_values(by='age', ascending=False).head(3)

In [None]:
# Some of the old people are rich, so measure how unusual each age is.
# 'agestd' is the absolute distance of an age from the mean age, expressed in units of
# the sample standard deviation (an absolute z-score, not a standard deviation itself).
# Series.abs() is the vectorised equivalent of applying Python's abs element by element.
rich_df['agestd'] = (rich_df['age'] - rich_df['age'].mean()).abs() / rich_df['age'].std()
rich_df.sort_values(by='age', ascending=False).head(15)

In [None]:
# 'wealth_stdev' is the absolute distance of each net worth from the mean net worth,
# in units of the sample standard deviation (an absolute z-score).
# Series.abs() is the vectorised equivalent of applying Python's abs element by element.
rich_df['wealth_stdev'] = (
    (rich_df['networthusbillion'] - rich_df['networthusbillion'].mean()).abs()
    / rich_df['networthusbillion'].std()
)
rich_df.sort_values(by='networthusbillion', ascending=False).head(10)

In [None]:
# Histogram of net worth (default binning) to see the shape of the distribution.
rich_df['networthusbillion'].hist()

In [None]:
# Net-worth box plot again, for comparison with the histogram drawn just before it.
rich_df.boxplot(column='networthusbillion', return_type='axes')

In [None]:
# Create one figure with two side-by-side subplots (1 row x 2 columns)
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: net worth of everyone, with more bins than the default
rich_df['networthusbillion'].hist(ax=axs[0], bins=25)
axs[0].set_title('Net Worth Histogram (All)')
axs[0].set_xlabel('Net Worth (Billion USD)')
axs[0].set_ylabel('Count')

# Keep only the outlying cases: people worth more than $50 billion
rich2 = rich_df[rich_df['networthusbillion'] > 50]
print(rich2['networthusbillion'])

# Right panel: reuse the subset printed above so the listing and the plot
# are guaranteed to show the same rows
rich2['networthusbillion'].hist(ax=axs[1])
axs[1].set_title('Net Worth Histogram (>$50 Billion)')
axs[1].set_xlabel('Net Worth (Billion USD)')
axs[1].set_ylabel('Count')

plt.tight_layout()
plt.show()


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for net worth.
rich_df['networthusbillion'].describe()

In [None]:
# Histogram of age using 30 bins.
rich_df['age'].hist(bins=30)

In [None]:
# Age vs. net worth. A colormap only takes effect when `c` supplies values to map;
# without it matplotlib ignores cmap (and warns), so colour the points by net worth.
rich_df.plot(x='age', y='networthusbillion', kind='scatter', alpha=0.5,
             c='networthusbillion', cmap='rainbow')

In [None]:
# Age against net worth, with each point coloured by its net worth and a
# colour scale alongside so the figure can be read on its own.
plt.scatter(
    x=rich_df['age'],
    y=rich_df['networthusbillion'],
    c=rich_df['networthusbillion'],
    cmap='coolwarm',
    alpha=0.5,
)
plt.gca().set(xlabel="Age", ylabel="Net Worth (Billion USD)", title="Age vs Net Worth")
plt.colorbar(label="Net Worth")
plt.show()

# Detection of Outliers using Visuals

In [None]:
# Visual outlier check: points drawn beyond the box plot's whiskers are candidate outliers.
rich_df.boxplot(column='networthusbillion', return_type='axes')

# Detection of Outliers using Z-score Method

In [None]:
from scipy.stats import zscore

# Standardise net worth as (value - mean) / standard deviation.
# scipy's zscore defaults to ddof=0, i.e. the population standard deviation.
rich_df['wealth_zscore'] = zscore(rich_df['networthusbillion'])

# Flag rows lying more than 3 standard deviations from the mean, on either side
z_outliers = rich_df[rich_df['wealth_zscore'].abs() > 3]
print("Z-score Outliers:")
print(z_outliers[['name', 'networthusbillion', 'wealth_zscore']])

# Detection of Outliers using IQR Method

In [None]:
# First and third quartiles of net worth, and the interquartile range between them
Q1, Q3 = (rich_df['networthusbillion'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

# Rows whose net worth falls outside the fences, on either side
iqr_outliers = rich_df[
    (rich_df['networthusbillion'] > upper_bound)
    | (rich_df['networthusbillion'] < lower_bound)
]
print("IQR Outliers:")
print(iqr_outliers[['name', 'networthusbillion']])

# Detection of Outliers using Tietjen-Moore Test

In [None]:
# The Tietjen-Moore test is not available as a built-in function in common Python libraries such as statsmodels or SciPy.
# Although it is a well-known method for detecting outliers, it is implemented less often than other tests (such as Grubbs' test or the ESD test).
# However, you can implement the Tietjen-Moore test yourself with custom code.

# Look up Grubbs and ESD (Extreme Studentized Deviate) Tests and test them out.