<a href="https://colab.research.google.com/github/yellowgram1543/6-Stages-of-AIML/blob/main/AIML0_Day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pandas: Summarizing and Computing Descriptive Statistics

In [1]:
import io

import numpy as np
import pandas as pd

# Small all-numeric frame used by the reductions below
df = pd.DataFrame(
    {
        'A': [1, 2, 3, 4, 5],
        'B': [10, 20, 30, 40, 50],
        'C': [100, 200, 300, 400, 500],
    }
)
# end="\n\n\n" terminates the output and leaves two blank lines after it
print(df, end="\n\n\n")

# Column totals
column_sums = df.sum()
print(column_sums, end="\n\n\n")

# Column arithmetic means
column_means = df.mean()
print(column_means, end="\n\n\n")

# Column medians (50th percentile)
column_medians = df.median()
print(column_medians, end="\n\n\n")

# Column sample standard deviations
column_std = df.std()
print(column_std, end="\n\n\n")

   A   B    C
0  1  10  100
1  2  20  200
2  3  30  300
3  4  40  400
4  5  50  500


A      15
B     150
C    1500
dtype: int64


A      3.0
B     30.0
C    300.0
dtype: float64


A      3.0
B     30.0
C    300.0
dtype: float64


A      1.581139
B     15.811388
C    158.113883
dtype: float64




**Comprehensive Descriptive Statistics**

In [2]:
# describe() on the numeric frame reports count, mean, std, min,
# the 25%/50%/75% quantiles and max for every column
summary_stats = df.describe()
print(summary_stats, end="\n\n\n")

# Frame mixing an integer, a string and a boolean column
df_mixed = pd.DataFrame(
    {
        'numeric': [1, 2, 3, 4, 5],
        'categorical': ['A', 'B', 'A', 'C', 'B'],
        'boolean': [True, False, True, True, False],
    }
)
print(df_mixed, end="\n\n\n")

# include='all' summarises every column regardless of its data type
all_stats = df_mixed.describe(include='all')
print(all_stats, end="\n\n\n")

# Limit the summary to the object and bool columns, i.e. the non-numeric ones
cat_stats = df_mixed.describe(include=['object', 'bool'])
print(cat_stats, end="\n\n\n")

              A          B           C
count  5.000000   5.000000    5.000000
mean   3.000000  30.000000  300.000000
std    1.581139  15.811388  158.113883
min    1.000000  10.000000  100.000000
25%    2.000000  20.000000  200.000000
50%    3.000000  30.000000  300.000000
75%    4.000000  40.000000  400.000000
max    5.000000  50.000000  500.000000


   numeric categorical  boolean
0        1           A     True
1        2           B    False
2        3           A     True
3        4           C     True
4        5           B    False


         numeric categorical boolean
count   5.000000           5       5
unique       NaN           3       2
top          NaN           A    True
freq         NaN           2       3
mean    3.000000         NaN     NaN
std     1.581139         NaN     NaN
min     1.000000         NaN     NaN
25%     2.000000         NaN     NaN
50%     3.000000         NaN     NaN
75%     4.000000         NaN     NaN
max     5.000000         NaN     NaN


       

**Advanced Statistical Measures**

In [3]:
# Ten evenly spaced values plus a weights column
# (the weights column is displayed but not used by any statistic below)
df_advanced = pd.DataFrame(
    {
        'values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'weights': [0.1, 0.2, 0.1, 0.3, 0.1, 0.05, 0.05, 0.05, 0.03, 0.02],
    }
)
print(df_advanced, end="\n\n\n")

# Sample variance of the values column
variance = df_advanced['values'].var()
print(variance, end="\n\n\n")

# Skewness: asymmetry of the distribution
skewness = df_advanced['values'].skew()
print(skewness, end="\n\n\n")

# Kurtosis: heaviness of the distribution's tails
kurtosis = df_advanced['values'].kurt()
print(kurtosis, end="\n\n\n")

# 25th, 50th, 75th and 90th percentiles in one call
quantiles = df_advanced['values'].quantile([0.25, 0.5, 0.75, 0.9])
print(quantiles, end="\n\n\n")

   values  weights
0       1     0.10
1       2     0.20
2       3     0.10
3       4     0.30
4       5     0.10
5       6     0.05
6       7     0.05
7       8     0.05
8       9     0.03
9      10     0.02


9.166666666666666


0.0


-1.2000000000000002


0.25    3.25
0.50    5.50
0.75    7.75
0.90    9.10
Name: values, dtype: float64




**Correlation and Covariance**

In [4]:
# Seed NumPy's global generator so the random draws are reproducible,
# then draw three standard-normal columns of 100 rows (X, Y, Z in that order)
np.random.seed(42)
df_corr = pd.DataFrame(
    {
        'X': np.random.randn(100),
        'Y': np.random.randn(100),
        'Z': np.random.randn(100),
    }
)
# This print shows the data as drawn, before X is blended into Y below
print(df_corr, end="\n\n\n")

# Replace Y with an equal-weight blend of X and Y to correlate the two columns
df_corr['Y'] = df_corr['X'] * 0.5 + df_corr['Y'] * 0.5

# Pairwise correlation coefficients (Pearson, as noted for the Spearman call below)
correlation_matrix = df_corr.corr()
print(correlation_matrix, end="\n\n\n")

# Pairwise covariances
covariance_matrix = df_corr.cov()
print(covariance_matrix, end="\n\n\n")

# Same pairwise table using Spearman rank correlation instead of Pearson
spearman_corr = df_corr.corr(method='spearman')
print(spearman_corr, end="\n\n\n")

           X         Y         Z
0   0.496714 -1.415371  0.357787
1  -0.138264 -0.420645  0.560785
2   0.647689 -0.342715  1.083051
3   1.523030 -0.802277  1.053802
4  -0.234153 -0.161286 -1.377669
..       ...       ...       ...
95 -1.463515  0.385317 -0.692910
96  0.296120 -0.883857  0.899600
97  0.261055  0.153725  0.307300
98  0.005113  0.058209  0.812862
99 -0.234587 -1.142970  0.629629

[100 rows x 3 columns]


          X         Y         Z
X  1.000000  0.635724  0.190840
Y  0.635724  1.000000  0.113064
Z  0.190840  0.113064  1.000000


          X         Y         Z
X  0.824770  0.353308  0.187922
Y  0.353308  0.374487  0.075022
Z  0.187922  0.075022  1.175669


          X         Y         Z
X  1.000000  0.599640  0.176694
Y  0.599640  1.000000  0.130753
Z  0.176694  0.130753  1.000000




**Handling Missing Values in Statistics**

In [5]:
# Frame in which every column has at least one NaN
df_missing = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 4, 5],
        'B': [10, np.nan, 30, np.nan, 50],
        'C': [100, 200, 300, 400, np.nan],
    }
)
print(df_missing, end="\n\n\n")

# mean() skips NaN by default, so each column averages only its present values
mean_with_na = df_missing.mean()
print(mean_with_na, end="\n\n\n")

# Replacing NaN with 0 first makes the missing entries count as zeros in the mean
mean_fill_na = df_missing.fillna(0).mean()
print(mean_fill_na, end="\n\n\n")

# Number of non-missing entries per column
non_null_counts = df_missing.count()
print(non_null_counts, end="\n\n\n")

# True for each column that contains at least one missing value
has_missing = df_missing.isna().any()
print(has_missing, end="\n\n\n")

     A     B      C
0  1.0  10.0  100.0
1  2.0   NaN  200.0
2  NaN  30.0  300.0
3  4.0   NaN  400.0
4  5.0  50.0    NaN


A      3.0
B     30.0
C    250.0
dtype: float64


A      2.4
B     18.0
C    200.0
dtype: float64


A    4
B    3
C    4
dtype: int64


A    True
B    True
C    True
dtype: bool




**Group-wise Statistics**

In [6]:
# Eight rows spread over three categories, with two numeric measurements each
df_groups = pd.DataFrame(
    {
        'category': ['A', 'A', 'B', 'B', 'C', 'C', 'A', 'B'],
        'value1': [10, 15, 20, 25, 30, 35, 12, 22],
        'value2': [100, 150, 200, 250, 300, 350, 120, 220],
    }
)
print(df_groups, end="\n\n\n")

# Mean of every numeric column within each category
group_means = df_groups.groupby('category').mean()
print(group_means, end="\n\n\n")

# A different list of aggregations per column, evaluated per category
group_stats = df_groups.groupby('category').agg(
    {
        'value1': ['mean', 'std', 'count'],
        'value2': ['min', 'max', 'median'],
    }
)
print(group_stats, end="\n\n\n")

# Full describe() summary computed separately for each category
group_describe = df_groups.groupby('category').describe()
print(group_describe, end="\n\n\n")

  category  value1  value2
0        A      10     100
1        A      15     150
2        B      20     200
3        B      25     250
4        C      30     300
5        C      35     350
6        A      12     120
7        B      22     220


             value1      value2
category                       
A         12.333333  123.333333
B         22.333333  223.333333
C         32.500000  325.000000


             value1                 value2            
               mean       std count    min  max median
category                                              
A         12.333333  2.516611     3    100  150  120.0
B         22.333333  2.516611     3    200  250  220.0
C         32.500000  3.535534     2    300  350  325.0


         value1                                                      value2  \
          count       mean       std   min    25%   50%    75%   max  count   
category                                                                      
A           3.0  12.3333

**Cumulative Statistics**

In [7]:
# Ordered sales and expenses figures, treated as a sequence over time
df_cumulative = pd.DataFrame(
    {
        'sales': [100, 120, 130, 110, 140, 160, 150, 170],
        'expenses': [80, 85, 90, 88, 95, 100, 98, 105],
    }
)
print(df_cumulative, end="\n\n\n")

# Running total of sales
cumulative_sales = df_cumulative['sales'].cumsum()
print(cumulative_sales, end="\n\n\n")

# Running maximum of each column
cumulative_max = df_cumulative.cummax()
print(cumulative_max, end="\n\n\n")

# Expanding mean: at each row, the mean of all rows up to and including it
cumulative_mean = df_cumulative.expanding().mean()
print(cumulative_mean, end="\n\n\n")

   sales  expenses
0    100        80
1    120        85
2    130        90
3    110        88
4    140        95
5    160       100
6    150        98
7    170       105


0     100
1     220
2     350
3     460
4     600
5     760
6     910
7    1080
Name: sales, dtype: int64


   sales  expenses
0    100        80
1    120        85
2    130        90
3    130        90
4    140        95
5    160       100
6    160       100
7    170       105


        sales   expenses
0  100.000000  80.000000
1  110.000000  82.500000
2  116.666667  85.000000
3  115.000000  85.750000
4  120.000000  87.600000
5  126.666667  89.666667
6  130.000000  90.857143
7  135.000000  92.625000




**Percentiles and Rank Statistics**

In [8]:
# Ten distinct exam-style scores
df_percentiles = pd.DataFrame(
    {'scores': [85, 92, 78, 96, 88, 73, 91, 82, 89, 94]}
)
print(df_percentiles, end="\n\n\n")

# Score at the 90th percentile
percentile_90 = df_percentiles['scores'].quantile(0.9)
print(percentile_90, end="\n\n\n")

# Descending rank: the highest score receives rank 1; ties would share the average rank
ranks = df_percentiles['scores'].rank(ascending=False, method='average')
print(ranks, end="\n\n\n")

# Ascending rank expressed as a fraction of the number of scores (0 to 1 scale)
percentile_ranks = df_percentiles['scores'].rank(pct=True)
print(percentile_ranks, end="\n\n\n")

   scores
0      85
1      92
2      78
3      96
4      88
5      73
6      91
7      82
8      89
9      94


94.2


0     7.0
1     3.0
2     9.0
3     1.0
4     6.0
5    10.0
6     4.0
7     8.0
8     5.0
9     2.0
Name: scores, dtype: float64


0    0.4
1    0.8
2    0.2
3    1.0
4    0.5
5    0.1
6    0.7
7    0.3
8    0.6
9    0.9
Name: scores, dtype: float64




**Unique Values and Frequency Statistics**

In [9]:
# Two string columns: a category label and a status label per row
df_freq = pd.DataFrame(
    {
        'category': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C'],
        'status': ['active', 'inactive', 'active', 'active',
                   'inactive', 'active', 'pending', 'active'],
    }
)
print(df_freq, end="\n\n\n")

# How often each category label occurs
value_counts = df_freq['category'].value_counts()
print(value_counts, end="\n\n\n")

# Array of the distinct status labels
unique_values = df_freq['status'].unique()
print(unique_values, end="\n\n\n")

# Number of distinct values in each column
nunique_counts = df_freq.nunique()
print(nunique_counts, end="\n\n\n")

# Contingency table: rows are categories, columns are statuses, cells are counts
crosstab = pd.crosstab(df_freq['category'], df_freq['status'])
print(crosstab, end="\n\n\n")

  category    status
0        A    active
1        B  inactive
2        A    active
3        C    active
4        B  inactive
5        A    active
6        C   pending
7        C    active


category
A    3
C    3
B    2
Name: count, dtype: int64


['active' 'inactive' 'pending']


category    3
status      3
dtype: int64


status    active  inactive  pending
category                           
A              3         0        0
B              0         2        0
C              2         0        1




**Memory and Data Type Statistics**

In [10]:
# Frame with one integer, one float, one string and one boolean column
df_memory = pd.DataFrame({
    'int_col': [1, 2, 3, 4, 5],
    'float_col': [1.1, 2.2, 3.3, 4.4, 5.5],
    'str_col': ['a', 'b', 'c', 'd', 'e'],
    'bool_col': [True, False, True, False, True]
})
print(df_memory)
print("\n")

# Per-column memory usage in bytes; deep=True requests the actual
# consumption including object overhead rather than a shallow estimate
memory_usage = df_memory.memory_usage(deep=True)
print(memory_usage)
print("\n")

# Data type of each column
data_types = df_memory.dtypes
print(data_types)
print("\n")

# DataFrame.info() writes its report to a stream and returns None, so
# `df_info = df_memory.info()` would always leave df_info set to None.
# Capture the report in an in-memory buffer so df_info holds the summary
# text, then emit that text once (end="" because the report is written
# verbatim, exactly as info() itself would have written it to stdout).
info_buffer = io.StringIO()
df_memory.info(buf=info_buffer)
df_info = info_buffer.getvalue()
print(df_info, end="")
print("\n")

   int_col  float_col str_col  bool_col
0        1        1.1       a      True
1        2        2.2       b     False
2        3        3.3       c      True
3        4        4.4       d     False
4        5        5.5       e      True


Index        132
int_col       40
float_col     40
str_col      250
bool_col       5
dtype: int64


int_col        int64
float_col    float64
str_col       object
bool_col        bool
dtype: object


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   int_col    5 non-null      int64  
 1   float_col  5 non-null      float64
 2   str_col    5 non-null      object 
 3   bool_col   5 non-null      bool   
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 257.0+ bytes




**Custom Statistical Functions**

In [12]:
# Define custom statistical functions
def coefficient_of_variation(series):
    """Return the standard deviation of ``series`` divided by its mean.

    When the mean is exactly zero the ratio is undefined, so ``np.nan`` is
    returned instead of dividing.
    """
    average = series.mean()
    if average != 0:
        return series.std() / average
    return np.nan

def iqr(series):
    """Return the interquartile range: 75th percentile minus 25th percentile."""
    upper_quartile = series.quantile(0.75)
    lower_quartile = series.quantile(0.25)
    return upper_quartile - lower_quartile

# Two small samples on which to run the custom statistics
df_custom = pd.DataFrame(
    {
        'group1': [10, 12, 11, 13, 14],
        'group2': [20, 25, 18, 22, 24],
    }
)

# Coefficient of variation evaluated once per column
cv_stats = df_custom.apply(coefficient_of_variation)
print(cv_stats, end="\n\n\n")

# Interquartile range evaluated once per column
iqr_stats = df_custom.apply(iqr)
print(iqr_stats, end="\n\n\n")

# Per-column aggregation lists mixing built-in names (strings) with the custom
# functions; the custom ones are passed as function objects, not as strings
combined_stats = df_custom.agg(
    {
        'group1': ['mean', 'std', coefficient_of_variation],
        'group2': ['median', iqr],
    }
)
print(combined_stats, end="\n\n\n")

group1    0.131762
group2    0.131356
dtype: float64


group1    2.0
group2    4.0
dtype: float64


                             group1  group2
mean                      12.000000     NaN
std                        1.581139     NaN
coefficient_of_variation   0.131762     NaN
median                          NaN    22.0
iqr                             NaN     4.0


