**Import Libraries and Data**

In [1]:
!pip install pandas



In [None]:
import pandas as pd

In [None]:
# Upload the workbook through the google.colab upload helper (this cell needs a Colab runtime).
# The return value is kept in `uploaded`; none of the cells shown below read it again.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the Superstore workbook into `df`, then preview its first five rows.
df = pd.read_excel(io='Sample - Superstore_table.xlsx')
df.head(n=5)

**Generate Summary Statistics That Describe a Variable's Numeric Values**

In [None]:
# describe() on 'Sales' reports count, mean, std, min, the quartiles and max in one Series.
summary = df.loc[:, 'Sales'].describe()
print(summary)

In [None]:
# Central-tendency metrics for 'Sales': mean, median and mode.
mean_value = df['Sales'].mean()
median = df['Sales'].median()
# Series.mode() returns a Series, not a scalar: several values can tie for most frequent.
mode = df['Sales'].mode()

# Print mean, median, and mode values.
print(f"Mean: {mean_value}")
print(f"Median: {median}")
# Join the individual mode values; formatting the Series itself would embed its
# multi-line repr (index column and dtype footer) after the "Mode:" label.
print(f"Mode: {', '.join(str(value) for value in mode)}")

**Generate Summary Statistics That Describe a Variable's Distribution**

In [None]:
# 1st and 3rd quartiles (25th and 75th percentiles) of 'Sales', returned together as one Series.
quartiles = df['Sales'].quantile(q=[0.25, 0.75])

# Spread (standard deviation, variance) and asymmetry (skewness) of 'Sales'.
std_deviation = df['Sales'].std()
variance_value = df['Sales'].var()
skewness_value = df['Sales'].skew()

# Report everything in one call, one statistic per line.
print(
    f"Standard Deviation: {std_deviation}",
    f"Variance: {variance_value}",
    f"Skewness: {skewness_value}",
    f"Quartiles: {quartiles}",
    sep="\n",
)

**Data Visualization**

In [None]:
# Plotting libraries; the chart cells further down use the same `plt` and `sns` names.
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of 'Sales' using 20 bins, titled and labelled on both axes.
plt.hist(x=df['Sales'], bins=20)
plt.title('Histogram of Sales')
plt.ylabel('Frequency')
plt.xlabel('Sales')
plt.show()

In [None]:
# Box plot of 'Sales' (figsize 8 x 4) to show the spread of the values and any outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['Sales'])
plt.title('Box Plot of Sales')
plt.xlabel('Sales')
plt.show()

**Create Crosstab Table**

In [None]:
# Contingency table: one row per 'Category' value, one column per 'Region' value.
crosstab_table = pd.crosstab(index=df['Category'], columns=df['Region'])
print(crosstab_table)

In [None]:
# Show the same crosstab through IPython's display() instead of print(),
# so the notebook renders it with its rich table output rather than plain text.
from IPython.display import display
display(crosstab_table)

**Summarizing Categorical Data**

In [None]:
# Count how many rows fall into each distinct 'Category' value.
category_counts = df.loc[:, 'Category'].value_counts()
print(category_counts)

In [None]:
# Turn the per-category counts into a two-column DataFrame:
# reset_index() moves the category labels out of the index into a regular column.
frequency_table = (
    df['Category']
    .value_counts()
    .reset_index()
)
# Give both columns explicit names.
frequency_table.columns = ['Category', 'Count']

# Show the resulting frequency table.
print(frequency_table)

In [None]:
# Most frequent 'Category' value(s); mode() returns a Series because ties are possible.
category_mode = df.loc[:, 'Category'].mode()
print(category_mode)

In [None]:
# Pie chart of each category's share of rows; autopct prints the percentage with one decimal.
plt.pie(
    x=frequency_table['Count'],
    labels=frequency_table['Category'],
    autopct='%1.1f%%',
)
plt.title('Proportion of Categories')
plt.show()

In [None]:
# Bar chart: one bar per category, bar height equal to that category's row count.
plt.bar(x=frequency_table['Category'], height=frequency_table['Count'])
plt.title('Frequency Distribution of Categories')
plt.ylabel('Count')
plt.xlabel('Category')
plt.show()

**Transforming Variables into the Categorical Data Type**

In [None]:
# Look up the dtype of every column in df and print the resulting listing.
data_types = df.dtypes
print(data_types)

In [None]:
# Print a structural overview of df: column names, non-null counts and dtypes.
# Comparing the non-null counts with the row count reveals columns with missing values.
df.info()

In [None]:
# Print the dtype 'Category' has right now, as a baseline for the conversion in the next cell.
print(df['Category'].dtype)

In [None]:
# Store 'Category' with the pandas 'category' dtype (the column in df is overwritten).
df['Category'] = df['Category'].astype(dtype='category')

# Confirm the conversion by printing the column's dtype.
print(df['Category'].dtype)

**Encode Categorical Data into Numeric Quantities Using the Techniques Below**

**Replace Value**

In [None]:
# Translate each 'Category' label into a numeric code.
# Series.map is used rather than Series.replace: by this point an earlier cell has
# converted 'Category' to the category dtype, and replace() on a categorical column
# is deprecated in pandas 2.2 (it emits a FutureWarning).
# Note: a label that is missing from the mapping becomes NaN, so the dict must
# list every category present in the data.
category_codes = {'Office Supplies': 1, 'Furniture': 2, 'Technology': 3}
replace_value = df['Category'].map(category_codes)

# Print the column with numeric codes (df itself is not modified).
print(replace_value)

**Encoding Labels**

In [None]:
# Print the 'Ship Mode' column as it currently stands in df, for comparison
# with the label-encoded values produced further down.
print(df['Ship Mode'])  # before encoding

In [None]:
!pip install scikit-learn

In [None]:
# Label-encode 'Ship Mode': each distinct value is assigned an integer label.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Keep the codes in their own variable instead of writing them back into
# df['Ship Mode']: overwriting the column would destroy the raw labels that the
# later "before" view and the backward difference encoding cell read from df.
encoding_label = le.fit_transform(df['Ship Mode'])

# Print encoded values of 'Ship Mode'
print(encoding_label)  # after encoding

**One-Hot Encoding:**



*   Create binary columns for each category, where 1 indicates the presence of the category and 0 indicates absence.
*   Use when there is no inherent order among categories.

In [None]:
# One-hot encode 'Segment': get_dummies returns a new frame in which 'Segment' is
# replaced by one indicator column per distinct value; df itself is left unchanged.
one_hot_encoding = pd.get_dummies(data=df, columns=['Segment'])
print(one_hot_encoding)

**Binary Encoding:**

*   Convert each category to binary code and create separate columns for each bit.
*   Useful for reducing dimensionality compared to one-hot encoding.

In [None]:
!pip install category_encoders

In [None]:
# Print the 'Category' column as it currently stands in df, for comparison
# with the binary-encoded columns produced in the next cell.
print(df['Category'])

In [None]:
# Binary-encode 'Category' with category_encoders (imported here as `ce`).
# Only the 'Category' column is passed to the encoder, so `df_encoder` holds just
# the encoded output; df itself is not reassigned by this cell.
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['Category'])
df_encoder = encoder.fit_transform(df['Category'])

# Print binary encoded data
print(df_encoder)

**Backward Difference Encoding:**



*  Encode categorical levels based on the difference between the current level and the previous one.
*  Useful when there is some meaningful sequential order among categories.

In [None]:
# Show 'Ship Mode' as it currently stands in df, right before backward difference encoding.
df['Ship Mode'] #before

In [None]:
# Backward-difference-encode 'Ship Mode' using the `ce` alias imported in the binary-encoding cell.
# The name `encoder` is rebound here, replacing the BinaryEncoder created earlier.
# Only the 'Ship Mode' column is passed in; df itself is not reassigned by this cell.
encoder = ce.BackwardDifferenceEncoder(cols=['Ship Mode'])
df_backward = encoder.fit_transform(df['Ship Mode'])

# Print the result of backward difference encoding
print(df_backward)