<a href="https://colab.research.google.com/github/thepersonuadmire/ML_EDA/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EDA - 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA - 1: used-bike listings from bike_details.csv.
# Every chart below is drawn on its own figure; without that, the two scatter
# plots and the heatmap would all be painted onto the same axes.

# Load the dataset
df = pd.read_csv("bike_details.csv")

# 1. Range of selling prices (max minus min)
price_range = df['selling_price'].max() - df['selling_price'].min()

# 2. Median selling price
median_price = df['selling_price'].median()

# 3. Most common seller type (first entry of the mode if several tie)
common_seller = df['seller_type'].mode()[0]

# 4. Count of bikes driven more than 50,000 km
high_mileage_bikes = df[df['km_driven'] > 50000].shape[0]

# 5. Average km_driven per ownership type
avg_km_by_owner = df.groupby('owner')['km_driven'].mean()

# 6. Proportion of bikes from 2015 or older (fraction of all rows)
older_bikes = (df[df['year'] <= 2015].shape[0]) / df.shape[0]

# 7. Missing-value count per column
missing_values = df.isnull().sum()

# 8. Highest ex-showroom price and corresponding bike
max_ex_showroom = df.loc[df['ex_showroom_price'].idxmax(), ['name', 'ex_showroom_price']]

# 9. Total bikes listed by each seller type
bikes_by_seller = df['seller_type'].value_counts()

# 10. Relationship between selling price and km_driven for first-owner bikes
first_owner_bikes = df[df['owner'] == '1st owner']
plt.figure()
sns.scatterplot(data=first_owner_bikes, x='km_driven', y='selling_price')
plt.title("Selling price vs. km driven (1st owner)")

# 11. Remove outliers in km_driven using the 1.5 * IQR fences
Q1 = df['km_driven'].quantile(0.25)
Q3 = df['km_driven'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df_cleaned = df[(df['km_driven'] >= lower_fence) & (df['km_driven'] <= upper_fence)]

# 12. Bivariate analysis of year vs. selling_price
plt.figure()
sns.scatterplot(data=df, x='year', y='selling_price')
plt.title("Selling price vs. manufacturing year")

# 13. Mean selling price per bike age, used as a proxy for depreciation
REFERENCE_YEAR = 2024  # year against which bike age is measured
df['bike_age'] = REFERENCE_YEAR - df['year']
depreciation = df.groupby('bike_age')['selling_price'].mean()

# 14. Bikes priced above the average for their manufacturing year.
# Any price strictly above the year's mean is flagged; no extra
# "significance" margin is applied.
year_avg_price = df.groupby('year')['selling_price'].mean()
# Vectorized lookup of each row's year average instead of a row-wise apply.
df['above_avg'] = df['selling_price'] > df['year'].map(year_avg_price)
above_avg_bikes = df[df['above_avg']][['name', 'selling_price', 'year']]

# 15. Correlation matrix and heatmap
correlation_matrix = df[['selling_price', 'km_driven', 'ex_showroom_price', 'year']].corr()
plt.figure()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation matrix")

plt.show()


EDA - 2

In [None]:
# EDA - 2: car sales from car_sales.csv.
# Each chart gets its own figure so successive plots do not overlay one
# another on a shared set of axes.
df = pd.read_csv("car_sales.csv")

# 1. Average selling price per dealer
avg_price_per_dealer = df.groupby('Dealer_Name')['Price ($)'].mean()

# 2. Car brand with highest price variation (largest standard deviation)
price_variation = df.groupby('Company')['Price ($)'].std().idxmax()

# 3. Distribution of car prices by transmission type
plt.figure()
sns.boxplot(x='Transmission', y='Price ($)', data=df)

# 4. Distribution of car prices across regions
plt.figure()
sns.boxplot(x='Dealer_Region', y='Price ($)', data=df)

# 5. Distribution of cars by body styles
body_style_distribution = df['Body Style'].value_counts()

# 6. Variation of price by gender and income.
# 'Annual Income' is used as a numeric column elsewhere in this cell, so
# using it directly as hue would create one hue level per distinct income.
# Bin it into quartile brackets first.
income_bracket = pd.qcut(df['Annual Income'], q=4, duplicates='drop')
plt.figure()
sns.boxplot(x='Gender', y='Price ($)', hue=income_bracket, data=df)

# 7. Distribution of car prices by region (same view as item 4)
plt.figure()
sns.boxplot(x='Dealer_Region', y='Price ($)', data=df)

# 8. Car prices by engine sizes
plt.figure()
sns.boxplot(x='Engine', y='Price ($)', data=df)

# 9. Car price vs. customer income
plt.figure()
sns.scatterplot(x='Annual Income', y='Price ($)', data=df)

# 10. Top 5 most sold car models
top_models = df['Model'].value_counts().head(5)

# 11. Car price variation by engine size and color
plt.figure()
sns.boxplot(x='Color', y='Price ($)', hue='Engine', data=df)

# 12. Seasonal trend: price per calendar month.
# lineplot aggregates 'Price ($)' over the rows of each month, so this shows
# the price level by month rather than the number of sales.
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
plt.figure()
sns.lineplot(x='Month', y='Price ($)', data=df)

# 13. Car price by body style & transmission type
plt.figure()
sns.boxplot(x='Body Style', y='Price ($)', hue='Transmission', data=df)

# 14. Correlation matrix for price, engine size, and income.
# NOTE(review): 'Engine' is used as a categorical axis above and may hold
# text; keep only the numeric columns so corr() does not fail on strings.
corr_inputs = df[['Price ($)', 'Engine', 'Annual Income']].select_dtypes(include='number')
plt.figure()
sns.heatmap(corr_inputs.corr(), annot=True)

# 15. Average price across car models & engine types
avg_price_by_model_engine = df.groupby(['Model', 'Engine'])['Price ($)'].mean()

plt.show()


EDA - 3

In [None]:
# EDA - 3: Amazon product listings from amazon_sales.csv.
# NOTE(review): the aggregations below assume rating, rating_count and the
# price columns are numeric -- confirm, and cast them first if the CSV stores
# them as text.
df = pd.read_csv("amazon_sales.csv")

# 1. Average rating for each category
avg_rating_per_category = df.groupby('category')['rating'].mean()

# 2. Top rating_count products by category.
# idxmax yields the row label of the top row per category; use
# df.loc[top_rated_products] to retrieve the product rows themselves.
top_rated_products = df.groupby('category')['rating_count'].idxmax()

# 3. Distribution of discounted vs. actual prices.
# Both histograms intentionally share one (new) figure for comparison.
plt.figure()
sns.histplot(df['discounted_price'], bins=30, color='blue', label='Discounted')
sns.histplot(df['actual_price'], bins=30, color='red', label='Actual', alpha=0.5)
plt.legend()

# 4. Average discount percentage by category
avg_discount_per_category = df.groupby('category')['discount_percentage'].mean()

# 5. Most popular product names
popular_products = df['product_name'].value_counts().head(10)

# 6. Most popular keywords in product descriptions
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
words = " ".join(df['about_product'].dropna()).lower().split()
# Build the stopword set once: calling stopwords.words() inside the
# comprehension would rebuild the list and scan it linearly for every word.
english_stopwords = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in english_stopwords]
common_words = Counter(filtered_words).most_common(10)

# 7. Most popular product reviews (most frequently repeated review text)
popular_reviews = df['review_content'].value_counts().head(10)

# 8. Correlation between discounted_price and rating
correlation = df[['discounted_price', 'rating']].corr()

# 9. Top 5 categories by highest average rating
top_categories_by_rating = df.groupby('category')['rating'].mean().nlargest(5)

# 10. Identify areas for improvement: missing-value count per column
missing_values = df.isnull().sum()


EDA - 4

In [None]:
# EDA - 4: Spotify tracks from spotify_tracks.csv.
# Axes-level plots each get their own figure; otherwise they would overlay
# each other or be drawn onto the pair plot's axes.
df = pd.read_csv("spotify_tracks.csv")

# 1. Handle duplicates & null values: drop duplicate rows, then forward-fill
# remaining nulls from the previous row.
df.drop_duplicates(inplace=True)
# DataFrame.ffill replaces the deprecated fillna(method='ffill').
df.ffill(inplace=True)

# 2. Popularity distribution
plt.figure()
sns.histplot(df['Popularity'])

# 3. Popularity vs. Duration
plt.figure()
sns.scatterplot(x='Duration (ms)', y='Popularity', data=df)

# 4. Artists with the highest track counts (top 5)
top_artists = df['Artist'].value_counts().head(5)

# 5. Top 5 least popular tracks
least_popular_tracks = df.nsmallest(5, 'Popularity')[['Artist', 'Track Name']]

# 6. Most popular artist by average popularity
most_popular_artist = df.groupby('Artist')['Popularity'].mean().idxmax()

# 7. Most popular track per artist.
# idxmax yields the row label of each artist's top track; use
# df.loc[most_popular_tracks] to retrieve the track rows themselves.
most_popular_tracks = df.groupby('Artist')['Popularity'].idxmax()

# 8. Pair plot (figure-level: seaborn creates its own figure)
sns.pairplot(df)

# 9. Duration variation across artists (box plot)
plt.figure()
sns.boxplot(x='Artist', y='Duration (ms)', data=df)
plt.xticks(rotation=90)  # artist names overlap when drawn horizontally

# 10. Popularity distribution by artist (violin plot)
plt.figure()
sns.violinplot(x='Artist', y='Popularity', data=df)
plt.xticks(rotation=90)

plt.show()
