# Data Transformation and Manipulation in Python
In this notebook, we will go through common Python idioms for data transformation, manipulation, and analysis using libraries such as pandas and NumPy.


In [None]:
import pandas as pd
import numpy as np

# Load the dataset into a DataFrame.
# NOTE: 'your_dataset.csv' is a placeholder path; point it at a real CSV file.
df = pd.read_csv('your_dataset.csv')

# Quick overview of the dataset.
# A notebook cell only renders its *last* bare expression, so an intermediate
# result such as df.head() must be shown explicitly with display(); otherwise
# it is computed and silently discarded.
display(df.head())  # First few rows of the dataset
df.info()  # Prints a concise summary of the dataset
df.describe()  # Descriptive statistics; rendered as the cell's output


## 1. Handling Missing Data
Dealing with missing data is a key aspect of data preprocessing.

In [None]:
# Count the missing values in each column.
# display() is required here because only the last bare expression of a cell
# is rendered, and this one is followed by further statements.
display(df.isnull().sum())

# Fill missing values.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when the frame contains non-numeric
# (e.g. string/categorical) columns.
# Reassignment is used instead of inplace=True to avoid in-place mutation.
df = df.fillna(df.mean(numeric_only=True))  # Numeric columns: fill NaN with the column mean
df = df.fillna(df.mode().iloc[0])  # Remaining NaN: fill with each column's most frequent value

# Drop rows that still contain missing values after both fills.
df = df.dropna()


## 2. Data Transformation
Data transformation is a process of converting data from one format or structure to another.

In [None]:
# Take one handle on the source column rather than re-indexing it every time.
source = df['column_name']

# Log transformation (element-wise natural logarithm)
df['log_column'] = np.log(source)

# Z-score standardization: centre on the mean, then divide by the standard deviation
centered = source - source.mean()
df['standardized'] = centered / source.std()

# Min-max scaling: shift by the minimum, then divide by the value range
value_range = source.max() - source.min()
df['scaled'] = (source - source.min()) / value_range


## 3. Feature Engineering
Creating new features to improve model performance.

In [None]:
# Derive a ratio feature from two existing columns
numerator = df['existing_feature_1']
denominator = df['existing_feature_2']
df['new_feature'] = numerator.div(denominator)

# One-hot encode the listed categorical column(s)
columns_to_encode = ['categorical_column']
df = pd.get_dummies(df, columns=columns_to_encode)


## 4. Grouping and Aggregation
Aggregate data based on certain categories or features.

In [None]:
# Mean of the numerical column within each category
aggregations = {'numerical_column': 'mean'}
by_category = df.groupby('category_column')
grouped = by_category.agg(aggregations)
grouped


## 5. Data Filtering and Sorting
Select specific subsets of the data and sort rows by column values.

In [None]:
# Cutoff used by the filter below. It is defined here so that the cell runs
# on a fresh kernel instead of raising NameError.
# Placeholder value: replace it with a cutoff that is meaningful for your data.
threshold = 0

# Filtering: keep only the rows whose value is strictly above the threshold
filtered_data = df[df['column_name'] > threshold]

# Sorting: order the rows by the column, largest values first
sorted_data = df.sort_values(by='column_name', ascending=False)


## 6. Merging and Joining Datasets
Combine multiple datasets together.

In [None]:
# Inner join of the two frames on their shared key column.
# NOTE(review): df1 and df2 are not defined in the visible part of this
# notebook; they must be loaded before this cell is run.
join_key = 'common_column'
merged_df = pd.merge(left=df1, right=df2, how='inner', on=join_key)


## 7. Pivot Tables
Create a pivot table for summarizing and analyzing data.

In [None]:
# Average of the numerical column for each category, laid out as a pivot table
pivot_table = pd.pivot_table(
    df,
    values='numerical_column',
    index='category_column',
    aggfunc='mean',
)
pivot_table


## 8. Data Visualization
Visualizing data for insights.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap.
# numeric_only=True restricts the correlation to numeric columns; without it,
# pandas >= 2.0 raises a ValueError when the frame contains non-numeric columns.
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')  # A figure should stand alone when skimmed
plt.show()

# Scatter plot of y_column against x_column
plt.scatter(df['x_column'], df['y_column'])
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.title('Scatter Plot')
plt.show()
