In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# NOTE(review): this ignores every warning category for the rest of the kernel
# session, so deprecation and convergence warnings from the libraries used in
# this notebook will not be shown. Consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
# Render every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# Walk ../data recursively and print the path of each file found.
for folder, _, files_in_folder in os.walk('../data'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

In [None]:
# Load the dataset from the data directory into the working DataFrame.
df = pd.read_csv('../data/clean_dataset.csv')

# Preview the first five rows (head() returns 5 rows by default).
df.head()

In [None]:
# Preview the last five rows (tail() returns 5 rows by default).
df.tail()

In [None]:
# Report the dataset dimensions: the full (rows, columns) tuple first,
# then each axis on its own line.
row_count, column_count = df.shape
print("Shape: ", df.shape)
print("Number of Columns:", column_count)
print("Number of Rows:", row_count)

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for the numerical columns, one column of output per feature.
df.describe()

In [None]:
# Same descriptive statistics for the numerical columns, transposed so that
# each feature is a row and each statistic is a column.
df.describe().T

In [12]:
# Remove the unwanted 'Unnamed: 0' column (presumably a row index that was
# written into the CSV -- TODO confirm).
# errors='ignore' keeps this cell idempotent: re-running it, or loading a CSV
# that lacks the column, no longer raises KeyError. The result is reassigned
# instead of mutating in place, so the cell reads as an explicit transformation.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Count the missing (NaN/None) values in each column.
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
# (the first occurrence of each row is not counted).
print("Number of Duplicates: ", df.duplicated().sum())

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Show all column labels.
df.columns

In [None]:
# Partition the column names by dtype: object ('O') columns are treated as
# categorical, every other dtype is treated as numeric.
column_dtypes = df.dtypes
numeric_features = [name for name, dtype in column_dtypes.items() if dtype != 'O']
categorical_features = [name for name, dtype in column_dtypes.items() if dtype == 'O']

# Report how many columns fell into each group, followed by their names.
print(f'Numerical Features : {len(numeric_features)} : {numeric_features}')
print(f'Categorical Features : {len(categorical_features)} : {categorical_features}')


In [None]:
#get unique values in categorical columns
for column in categorical_features:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

In [None]:
# Re-inspect the frame summary; by this point the 'Unnamed: 0' column has been
# dropped, so this shows the columns that remain.
df.info()

In [None]:
# Descriptive statistics for the numerical columns that remain after the
# 'Unnamed: 0' column was dropped.
df.describe()

In [None]:
#number of flights by airline
df1=df.groupby(['flight','airline'],as_index=False).count()
df1['airline'].value_counts()

In [None]:
#economy vs business class
df2=df.groupby(['flight','airline','class'],as_index=False).count()
df2['class'].value_counts()

In [None]:
#number of flights by source and destination
# Number of flights by source and destination.
# Step 1: collapse the data to one row per distinct
#         (flight, source_city, destination_city, airline, class) combination.
# Step 2: count those rows per (source_city, destination_city) pair and
#         preview the first 10 pairs.
(
    df
    .groupby(['flight', 'source_city', 'destination_city', 'airline', 'class'], as_index=False)
    .count()
    .groupby(['source_city', 'destination_city'], as_index=False)['flight']
    .count()
    .head(10)
)

In [None]:
#average price by source and destination
# Average price per airline for each (source_city, destination_city) pair;
# only the first 10 groups are shown.
(
    df
    .groupby(['airline', 'source_city', 'destination_city'], as_index=False)['price']
    .mean()
    .head(10)
)

In [None]:
# Select the price column once per travel class instead of re-filtering the
# frame for every statistic.
economy_prices = df.loc[df["class"] == "Economy", "price"]
business_prices = df.loc[df["class"] == "Business", "price"]

# Mean and median ticket price per class, rounded to zero decimal places.
economy_price_mean = round(economy_prices.mean(), 0)
economy_price_median = round(economy_prices.median(), 0)
business_price_mean = round(business_prices.mean(), 0)
business_price_median = round(business_prices.median(), 0)

print("Economy Class (Mean): ", economy_price_mean)
print("Economy Class (Median): ", economy_price_median)
print("Business Class (Mean): ", business_price_mean)
print("Business Class (Median): ", business_price_median)