In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# Apply seaborn's default plot styling; color_codes=True asks seaborn to remap
# matplotlib's single-letter colour shorthands (e.g. "b", "g") to its palette.
sns.set(color_codes=True)


In [None]:
# Read the comma-separated dataset from disk into the working DataFrame
df = pd.read_csv(
    "csv/data.csv",
    sep=",",
)

In [None]:
# Show the dtype of every column so that text columns and numeric columns
# can be told apart before any cleaning is done.
df.dtypes

In [None]:
# Discard the columns this analysis does not use, then preview the first rows
df = df.drop(
    columns=[
        "Engine Fuel Type",
        "Market Category",
        "Vehicle Style",
        "Popularity",
        "Number of Doors",
        "Vehicle Size",
    ]
)
df.head()

In [None]:
# Give the remaining columns shorter, easier-to-type names, then preview the frame
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
df.head()

In [None]:
# Total number of rows and columns, shown as a (rows, columns) tuple
df.shape

In [None]:
# Collect every row that repeats an earlier row; the printed value is the
# (rows, columns) shape of that subset, so its first entry is the duplicate count.
duplicate_rows_df = df.loc[df.duplicated(keep="first")]
print("Number of duplicate rows: ", duplicate_rows_df.shape)

In [None]:
# Count the non-null values in each column (a column with missing data
# reports fewer values than the number of rows)
df.count()

In [None]:
# Keep only the first occurrence of each repeated row, then show the
# per-column non-null counts of what remains
df = df.drop_duplicates(keep="first")
df.count()

In [None]:
# Tally the missing values in each column
print(df.isna().sum())

In [None]:
# Remove every row that has at least one missing value, then recount per column
df = df.dropna(axis=0, how="any")
df.count()

In [None]:
# Box plot of Price: points beyond the whiskers are candidate outliers
sns.boxplot(data=df, x="Price")

In [None]:
# Box plot of HP: points beyond the whiskers are candidate outliers
sns.boxplot(data=df, x="HP")

In [None]:
# Box plot of Cylinders: points beyond the whiskers are candidate outliers
sns.boxplot(data=df, x="Cylinders")

In [None]:
# Interquartile range (IQR = Q3 - Q1) of every numeric column.
# numeric_only=True is passed explicitly: df still contains text columns
# (such as Make), and since pandas 2.0 DataFrame.quantile no longer skips
# them by default -- it raises a TypeError instead.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Remove outliers: drop every row in which any numeric column lies more than
# 1.5 * IQR below Q1 or above Q3.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Compare only the columns the quartiles were computed for. Comparing the whole
# frame (text columns included) against these Series is a misaligned
# DataFrame-vs-Series comparison, which pandas 2.0+ rejects with
# "Operands are not aligned".
numeric_part = df[IQR.index]
is_outlier_row = ((numeric_part < lower_bound) | (numeric_part > upper_bound)).any(axis=1)

df = df[~is_outlier_row]
df.shape

In [None]:
# Bar chart of how many cars each make has, limited to the 40 most frequent makes
make_ax = df["Make"].value_counts().nlargest(40).plot.bar(figsize=(10, 5))
make_ax.set_title("Number of cars by make")
make_ax.set_ylabel("Number of cars")
make_ax.set_xlabel("Make")

In [None]:
# Pairwise correlation between the numeric variables, drawn as an annotated heatmap.
plt.figure(figsize=(11, 5))
# Restrict to numeric/bool columns before correlating: since pandas 2.0
# DataFrame.corr() no longer drops text columns silently and raises a
# ValueError when it meets one. select_dtypes also works on older pandas.
c = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(c, cmap="BrBG", annot=True)
# Leave the correlation matrix as the cell's output so the exact values are visible too
c

In [None]:
# Scatter plot of price against horsepower
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df["HP"], y=df["Price"])
ax.set(xlabel="HP", ylabel="Price")
plt.show()