In [None]:
import pandas as pd
import numpy as np
import io

# Build a small employee table from a column-oriented mapping:
# each key becomes a column, each list supplies that column's values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[22, 27, 22, 32],
    Salary=[50000, 60000, 55000, 70000],
)
df = pd.DataFrame(data)
print("DataFrame:", df, sep="\n")

# A Series is a single labelled column; here the labels are 'a'..'d'.
series_example = pd.Series([1, 2, 3, 4], index=list('abcd'))
print("\nSeries Example:", series_example, sep="\n")

# Three granularities of access on the same frame:
# a whole column, a whole row (by index label), and a single cell.
print("\nDataFrame Example:", "Accessing 'Age' column:", df['Age'], sep="\n")
print("\nAccessing row with index 1:", df.loc[1], sep="\n")
print("\nAccessing specific cell (Name of row 2):", df.at[2, 'Name'], sep="\n")

# Create sample CSV data in memory instead of reading from file.
# The first line is the header; every following line is one employee record.
csv_data = """Name,Age,Salary,Experience
Alice,24,50000,2
Bob,27,60000,5
Charlie,22,55000,0
David,32,70000,10
Eve,29,80000,7
Frank,26,65000,3
Grace,28,75000,6
Henry,30,85000,9
Ian,31,90000,8
Jack,25,62000,1
Kathy,23,58000,2
Liam,34,95000,12
Mia,35,100000,15"""

# Read CSV from string (simulating file read): StringIO wraps the text in a
# file-like object that read_csv can consume.
df_csv = pd.read_csv(io.StringIO(csv_data))
print("\nDataFrame from CSV (head):")
print(df_csv.head())
print("\nDataFrame from CSV (tail):")
print(df_csv.tail())
print("\nDataFrame from CSV (info):")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_csv.info()

# Set maximum number of rows to display.
# NOTE: this is a global pandas option, so it also affects every later print.
pd.set_option('display.max_rows', 10)
print("\nDataFrame with MAX_ROWS set to 10:")
print(df_csv)

# Statistical analysis: describe() summarises the numeric columns
print("\nStatistical Summary:")
print(df_csv.describe())

# Average age
average_age = df_csv['Age'].mean()
print(f"\nAverage Age: {average_age:.2f}")

# Create a DataFrame with some missing values to demonstrate cleaning.
# Age and Salary each contain one NaN; the last Name is an empty string,
# which is NOT considered missing until it is converted to NaN further down.
data_with_na = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', ''],
    'Age': [22, 27, np.nan, 32, 29, 25],
    'Salary': [50000, 60000, 55000, np.nan, 80000, 65000]
}
df_na = pd.DataFrame(data_with_na)
print("\nDataFrame with missing values:")
print(df_na)

# Drop rows with NA values (the empty-string Name row survives: '' is not NA)
df_cleaned = df_na.dropna()
print("\nCleaned DataFrame (dropped NA):")
print(df_cleaned)

# Fill NA in Salary with median (work on a copy so df_na stays untouched)
df_filled = df_na.copy()
df_filled['Salary'] = df_filled['Salary'].fillna(df_filled['Salary'].median())
print("\nDataFrame with filled NA in Salary:")
print(df_filled)

# Replace empty string with NaN so blank cells count as missing
df_empty = df_na.replace('', np.nan)
print("\nDataFrame with empty cells replaced with NaN:")
print(df_empty)

# Replace NaN in Age with mean (mean() skips NaN by default)
df_mean = df_empty.copy()
mean_age = df_mean['Age'].mean()
df_mean['Age'] = df_mean['Age'].fillna(mean_age)
print(f"\nDataFrame with NaN in 'Age' replaced with mean ({mean_age:.2f}):")
print(df_mean)

# Replace NaN in Age with median
df_median = df_empty.copy()
median_age = df_median['Age'].median()
df_median['Age'] = df_median['Age'].fillna(median_age)
print(f"\nDataFrame with NaN in 'Age' replaced with median ({median_age}):")
print(df_median)

# Replace NaN in Age with mode.
# Series.mode() ignores NaN and returns EVERY value tied for the highest
# frequency, sorted ascending. When all values are unique (as here) it
# therefore returns all of them; it is empty only when the column has no
# non-missing values at all.
df_mode = df_empty.copy()
mode_age = df_mode['Age'].mode()
if not mode_age.empty:
    # Positional access picks the first (smallest) of the tied modes.
    df_mode['Age'] = df_mode['Age'].fillna(mode_age.iloc[0])
    print(f"\nDataFrame with NaN in 'Age' replaced with mode ({mode_age.iloc[0]}):")
else:
    print("\nNo mode found for Age column (no non-missing values)")
print(df_mode)

# Data type conversion: take an independent copy of df and turn its
# integer Age column into strings.
df_str_age = df.copy()
df_str_age['Age'] = df_str_age['Age'].astype(str)
print("\nDataFrame with Age as string:", df_str_age, sep="\n")
print("Data types:", df_str_age.dtypes, sep="\n")

# Keep only the rows whose string Age is made up entirely of digits
df_corrected = df_str_age.loc[df_str_age['Age'].str.isdigit()]
print("\nDataFrame with corrected Age data (digits only):", df_corrected, sep="\n")

# Append the first two rows of df to itself so the frame holds exact
# duplicates; ignore_index renumbers the combined rows from 0.
df_with_duplicates = pd.concat([df, df.iloc[:2]], ignore_index=True)
print("\nDataFrame with duplicates:", df_with_duplicates, sep="\n")

# Remove duplicates (the first occurrence of each row is the one kept)
df_no_duplicates = df_with_duplicates.drop_duplicates(keep='first')
print("\nDataFrame with duplicates removed:", df_no_duplicates, sep="\n")

# Pairwise correlations between the numeric columns of the CSV-derived frame
# (non-numeric columns such as Name are excluded first).
correlation = df_csv.select_dtypes(include='number').corr()
print("\nCorrelation matrix:", correlation, sep="\n")

# Additional useful operations

# Mean Salary within each distinct Age value
print("\nGrouping by Age and calculating mean Salary:")
age_groups = df_csv.groupby(by='Age')['Salary'].mean()
print(age_groups)

# Boolean-mask selection of the rows earning strictly more than 70000
print("\nFiltering employees with Salary > 70000:")
high_salary = df_csv.loc[df_csv['Salary'] > 70000]
print(high_salary)

# Highest salaries first; only the top five rows are shown
print("\nSorting by Salary (descending):")
sorted_df = df_csv.sort_values(by='Salary', ascending=False)
print(sorted_df.head())

# Persist the CSV-derived frame to disk without the index column.
# Writing is best-effort: any failure is reported and the script continues.
try:
    df_csv.to_csv(path_or_buf='output.csv', index=False)
    print("\nDataFrame saved to 'output.csv'.")
except Exception as err:
    print(f"\nError saving file: {err}")
    print("File saving may not work in some environments (like GitHub preview)")

# Closing banner, then one bullet per topic covered by the tutorial.
print("\n=== Pandas Tutorial Complete ===")
print(
    "This notebook demonstrates:",
    "- Creating DataFrames and Series",
    "- Data access and manipulation",
    "- Handling missing values",
    "- Data cleaning and filtering",
    "- Statistical operations",
    "- Data type conversions",
    "- Grouping and sorting",
    "- Correlation analysis",
    sep="\n",
)