<a href="https://colab.research.google.com/github/vikasyankanchi/0000/blob/main/week_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Small hand-made table containing an exact duplicate row, a missing name
# and a missing age, so that every cleaning step below has something to do.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Bob', None, 'Alice'],
    Age=[25, None, 35, 30, 22, 25],
    City=['Delhi', 'Mumbai', 'Bangalore', 'Mumbai', 'Pune', 'Delhi'],
)

df = pd.DataFrame(data)

# Show the table before any cleaning
print("Original Data:")
print(df)

# Cleaning pipeline, applied in this order:
#   1. drop rows that exactly repeat an earlier row
#   2. drop rows that have no Name
#   3. replace missing Age values with the mean Age of the rows that remain
df = (
    df.drop_duplicates()
      .dropna(subset=['Name'])
      .assign(Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()))
)

print("\nCleaned Data:")
print(df)


In [None]:
import pandas as pd

# Load the dataset that is going to be audited
df = pd.read_csv('data.csv')

# Check structure and completeness (info() prints its report itself)
df.info()

# Summary statistics for every column, numeric or not.
# Printed explicitly: a notebook cell only renders its final expression,
# so a bare describe() call in the middle of the cell would show nothing.
print(df.describe(include='all'))

# Number of rows that exactly repeat an earlier row (rendered as cell output)
df.duplicated().sum()

In [None]:
import pandas as pd

# Deliberately messy sample: a repeated row, implausible ages (-5, 200),
# inconsistent gender labels, and missing / negative salaries.
data = dict(
    ID=[1, 2, 2, 4, 5, 2],
    Age=[25, 30, 30, -5, 200, 30],
    Gender=['Male', 'male', 'Male', 'F', 'Unknown', 'male'],
    Salary=[4000, None, 4000, 5000, -3000, None],
)
df = pd.DataFrame.from_dict(data)
print(df)

In [None]:
# Flag every row that exactly repeats an earlier row
duplicate_mask = df.duplicated()

# Count duplicates. Printed explicitly because only the last expression of a
# cell is rendered; a bare count on the first line would never be shown.
print("Duplicate rows:", duplicate_mask.sum())

# The duplicated rows themselves (rendered as the cell output)
df[duplicate_mask]

In [None]:
import matplotlib.pyplot as plt

# Numeric summary of Age: min/max far away from the quartiles point at
# extreme values
age_summary = df['Age'].describe()
print(age_summary)

# Box plot of the same column for a visual check of outliers
fig, ax = plt.subplots()
df['Age'].plot.box(ax=ax)
plt.show()

In [None]:
# Frequency of each raw gender label -- exposes spelling and case variants
gender_counts = df['Gender'].value_counts()
print(gender_counts)

# Map the known variants onto canonical labels; 'Unknown' becomes missing
gender_fixes = {'male': 'Male', 'F': 'Female', 'Unknown': None}
df['Gender'] = df['Gender'].replace(gender_fixes)

# Rows with a negative salary, which cannot be valid
negative_salary = df['Salary'] < 0
print(df[negative_salary])

In [None]:
# Number of missing entries in each column (rendered as the cell output)
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
import pandas as pd

# Four-row sample in which every column has exactly one gap
data = dict(
    Name=['Alice', 'Bob', None, 'Eve'],
    Age=[25, None, 30, 22],
    City=['Delhi', 'Mumbai', 'Bangalore', None],
)

df = pd.DataFrame(data)

# info() reports the non-null count per column, which reveals the gaps
df.info()

In [None]:
# Boolean view of the table: True marks a missing cell
missing_mask = df.isna()
print(missing_mask)

In [None]:
# Tally the missing cells column by column
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Heights in cm; the last value sits far away from the others
data = pd.Series([160, 162, 158, 165, 161, 250])

# Boxplot
fig, ax = plt.subplots()
ax.boxplot(data)
ax.set_title("Univariate Outlier Detection - Boxplot")
plt.show()

# IQR method: anything more than 1.5 * IQR outside the quartiles is flagged
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (data < lower_fence) | (data > upper_fence)
outliers = data[is_outlier]
print("Outliers:", outliers.values)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Paired measurements; the last pair lies far from the cluster of the others
height = np.array([160, 162, 158, 165, 161, 250])
weight = np.array([55, 60, 54, 63, 59, 80])

# Scatter plot: a point far from the main cloud is a bivariate outlier
fig, ax = plt.subplots()
ax.scatter(height, weight)
ax.set_xlabel("Height (cm)")
ax.set_ylabel("Weight (kg)")
ax.set_title("Bivariate Outlier Detection - Scatter Plot")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Twelve month-end timestamps covering 2023.
# An explicit MonthEnd offset is used instead of the 'M' string alias, which
# newer pandas releases deprecate in favour of 'ME'; the offset object yields
# the same dates on both old and new versions.
dates = pd.date_range(start='2023-01-01', periods=12, freq=pd.offsets.MonthEnd())

# Monthly sales; the sixth value (800) stands far above the other months
sales = pd.Series([200, 210, 205, 220, 215, 800, 225, 230, 240, 235, 200, 245], index=dates)

# Line plot of the series with a marker on every observation
sales.plot(marker='o', title="Time Series Outlier Detection")
plt.show()

# Simple detection: flag points more than 3 standard deviations from the mean
mean = sales.mean()
std = sales.std()
upper_limit = mean + 3*std
lower_limit = mean - 3*std
outliers = sales[(sales > upper_limit) | (sales < lower_limit)]
print("Outliers in time series:\n", outliers)

In [None]:
import pandas as pd
# Example data with one value far above the rest
data = pd.Series([5, 6, 7, 8, 100])

# Caps: the 5th and 95th percentiles of the data
lower_cap, upper_cap = data.quantile([0.05, 0.95])

# Values beyond a cap are pulled back to it; everything else is untouched
capped_data = data.clip(lower=lower_cap, upper=upper_cap)
print(capped_data)

In [None]:
import numpy as np
# Positive values with one very large entry
data = pd.Series([10, 12, 15, 20, 200])

# Natural log compresses the large value towards the others
log_data = data.apply(np.log)
print(log_data)

In [None]:
# Quartiles and inter-quartile range of `data` (taken from the kernel state)
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3 - Q1

# Keep only non-outliers: values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
# with both ends included
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
filtered_data = data[data.between(lower_fence, upper_fence)]
print(filtered_data)

In [None]:
import pandas as pd

# Dataset A: basic customer details
data_a = dict(
    Customer_ID=[101, 102, 103],
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
)
df_a = pd.DataFrame(data_a)

# Dataset B: customer addresses (ID 104 has no match in Dataset A)
data_b = dict(
    Customer_ID=[101, 102, 104],
    Address=['New York', 'Los Angeles', 'Chicago'],
)
df_b = pd.DataFrame(data_b)

print("Dataset A:")
print(df_a)
print("\nDataset B:")
print(df_b)

# ----- Adding Attributes (new column) -----
# A left join keeps every row of A and brings in Address where the ID matches;
# customers of A without an address get a missing value.
df_merge = df_a.merge(df_b, on='Customer_ID', how='left')
print("\nAfter Adding Attributes (Address column):")
print(df_merge)

# ----- Adding Data Objects (new rows) -----
# Customers that appear in B but are unknown to A
known_ids = df_a['Customer_ID']
is_new = ~df_b['Customer_ID'].isin(known_ids)
df_new_customers = df_b[is_new]

# Supply the columns B lacks (values hard-coded for the single new customer)
df_new_customers = df_new_customers.assign(Name=['David'], Age=[28])

# Stack the new rows beneath the merged table and renumber the index
df_final = pd.concat([df_merge, df_new_customers], ignore_index=True)
print("\nAfter Adding Data Objects (new customers):")
print(df_final)

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# ---- Sample Dataset ----
# Six customers with age, yearly income and a spending score
data = {
    'Customer_ID': [1, 2, 3, 4, 5, 6],
    'Age': [25, 30, 45, 40, 35, 50],
    'Annual_Income': [30000, 40000, 50000, 60000, 55000, 65000],
    'Spending_Score': [60, 70, 80, 50, 65, 90]
}
df = pd.DataFrame(data)

print("Original Dataset:")
print(df)

# ----------------------------------------------------
# 1. NUMEROSITY DATA REDUCTION
# ----------------------------------------------------

# Example: Aggregation - replace individual customers by one row per age group.
# pd.cut uses right-closed bins here, so an age of 30 falls in '20-30'.
df['Age_Group'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60], labels=['20-30', '31-40', '41-50', '51-60'])

# Age_Group is categorical. `observed=False` is passed explicitly so that every
# label (including the empty '51-60' bin, which yields NaN means) stays in the
# result and pandas does not warn about the default of `observed` changing.
numerosity_reduced = df.groupby('Age_Group', observed=False).agg({
    'Annual_Income': 'mean',
    'Spending_Score': 'mean'
}).reset_index()

print("\nNumerosity Reduction (Aggregation by Age Group):")
print(numerosity_reduced)

# ----------------------------------------------------
# 2. DIMENSIONALITY DATA REDUCTION
# ----------------------------------------------------

# Numeric inputs for PCA (the ID and the age-group label are left out)
features = ['Age', 'Annual_Income', 'Spending_Score']
X = df.loc[:, features]

# Standardise the features before running PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the three features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Put the component scores next to the customer IDs (rows align on the index)
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
dimensionality_reduced = df[['Customer_ID']].join(pca_df)

print("\nDimensionality Reduction (PCA - 2 Principal Components):")
print(dimensionality_reduced)

In [None]:
import pandas as pd
# Load only the three columns needed from the training file
wanted_columns = ['annotation_id', 'annotator', 'audio']
df = pd.read_csv("/content/train.csv", usecols=wanted_columns)

# Render the loaded frame as the cell output
df

In [None]:
# Number of null entries in each column (rendered as the cell output)
null_counts = df.isnull().sum()
null_counts

In [None]:
# Strategy: replace every missing value with the constant 0.
# `df` itself is left unchanged; the filled copy is shown as the cell output.
new_df = df.fillna(value=0)
new_df

In [None]:
# Strategy: forward fill -- each gap takes the last valid value above it.
# DataFrame.ffill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases; the result is the same.
new_df = df.ffill()
new_df

In [None]:
# Strategy: backward fill -- each gap takes the next valid value below it.
# DataFrame.bfill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases; the result is the same.
new_df = df.bfill()
new_df

In [None]:
# Strategy: fill gaps by interpolating between neighbouring values
# (interpolate() defaults to linear interpolation).
# NOTE(review): interpolation only makes sense for numeric columns; if the
# loaded columns hold text, newer pandas may warn or raise -- confirm dtypes.
new_df = df.interpolate()

# Show the interpolated copy; displaying `df` here would show the untouched
# original and hide the effect of this step.
new_df

In [None]:
# Strategy: discard every row that has at least one missing value
new_df = df.dropna(axis=0, how='any')
new_df

In [None]:
# Strategy: discard only the rows in which every value is missing
new_df = df.dropna(axis=0, how='all')
new_df

In [None]:
import pandas as pd
# Load the complete file (no usecols filter) to see which column names it
# actually contains
train_path = "/content/train.csv"
df_temp = pd.read_csv(train_path)
print(df_temp.columns)