In [None]:
# Libraries

import random
import math
import time
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load Data
# Read the parquet file into an Arrow table, then convert it to a pandas DataFrame.

arrow_table = pq.read_table('sim10k.pq')
df_read1 = arrow_table.to_pandas()

In [None]:
# Display the last rows of the freshly loaded table.
df_read1.tail()

In [None]:
# Explore variable data types
# Lists the dtype pandas assigned to each column of the loaded table.

df_read1.dtypes

In [None]:
# Working handle for the rest of the notebook.
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any in-place change through `df` is also visible through `df_read1`.
df = df_read1

In [None]:
# Dimension of dataset
# `shape` is the (rows, columns) tuple of the DataFrame.

dataset_shape = df.shape
print('The shape of the dataset : ', dataset_shape)

In [None]:
# Display the first rows of the working DataFrame.
df.head()

In [None]:
#Statistical properties

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Same summary statistics, transposed so that each variable is a row.
summary_stats = df.describe()
summary_stats.transpose()

In [None]:
# Statistical properties of the object-dtype (categorical) variables.
# `include=['object']` summarises every object column, not only 'name'.

df.describe(include=['object'])

In [None]:
# All variables
# `include='all'` puts every column, whatever its dtype, into one summary table.

df.describe(include='all')

In [None]:
# Missing data
# Count the null entries in each column.

null_mask = df.isna()
null_mask.sum()

In [None]:
# Explore categorical variables
# A column counts as categorical when its dtype is object ('O').

categorical = [
    column
    for column in df.columns
    if df[column].dtype == 'O'
]
categorical_count = len(categorical)

print('There are {} categorical variables\n'.format(categorical_count))

print('The categorical variables are :\n\n', categorical)

In [None]:
# Preview categorical variables
# Select all rows of the categorical columns and show the first few.

df.loc[:, categorical].head()

In [None]:
# Explore the variables
# Number of missing entries in the 'name' column.

name_column = df['name']
name_column.isna().sum()

In [None]:
# Uniqueness
# Number of distinct values in the 'name' column.

df['name'].nunique()

In [None]:
# View the frequency distribution of values
# Absolute count of rows for each value of the 'stall' column.

df['stall'].value_counts()

In [None]:
# View the relative frequency of each 'stall' value
# Counts are divided by the total number of rows, so the result is a fraction of
# all rows (0-1), not a percentage.

total_rows = len(df)
df['stall'].value_counts().div(total_rows)

In [None]:
# Frequency distribution of stall variable plot
# Horizontal count plot drawn on an explicitly created axes.

f, ax = plt.subplots(figsize=(8, 6))
ax = sns.countplot(data=df, y="stall", palette="Set1", ax=ax)
ax.set_title("Frequency distribution of stall variable")
plt.show()

In [None]:
# Frequency distribution of classes of the stall/targeted variable plot
# The value counts are computed once and reused for the bars and the tick labels.

f, ax = plt.subplots(figsize=(8, 5))
stall_counts = df.stall.value_counts()
ax = stall_counts.plot(kind="bar", color="blue", ax=ax)
ax.set_title("Frequency distribution of stall variable")
ax.set_xticklabels(stall_counts.index, rotation=30)
plt.show()

In [None]:
# Check for cardinality in categorical variables
# Report how many distinct labels each categorical column holds.

for var in categorical:
    label_count = len(df[var].unique())
    print(var, ' contains ', label_count, ' labels')

In [None]:
# Explore Numerical Variables
# Every column whose dtype is not object ('O') is treated as numerical.

numerical = [
    column
    for column in df.columns
    if df[column].dtype != 'O'
]
numerical_count = len(numerical)

print('There are {} numerical variables\n'.format(numerical_count))

print('The numerical variables are :\n\n', numerical)

In [None]:
# Preview the numerical variables
# Select all rows of the numerical columns and show the first few.

df.loc[:, numerical].head()

In [None]:
# Distribution of variables in the data

In [None]:
# Stall variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['stall']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of stall variable")
plt.show()

In [None]:
# Altitude variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(9, 5))
x = df['altitude']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of altitude variable")
plt.show()

In [None]:
# Vertical Speed variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['vertical_speed']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of vertical_speed variable")
plt.show()

In [None]:
# Roll variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['roll']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of roll variable")
plt.show()

In [None]:
# AOA variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['AOA']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of AOA variable")
plt.show()

In [None]:
# Airspeed variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['airspeed']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of airspeed variable")
plt.show()

In [None]:
# Flight path angle variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['flight_path_angle']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of flight_path_angle")
plt.show()

In [None]:
# Pitch variable distribution
# `sns.distplot` is deprecated; `sns.histplot` with a density-normalised histogram
# and a KDE overlay is its replacement. The plot is drawn on the explicit `ax`.

f, ax = plt.subplots(figsize=(10, 8))
x = df['pitch']
ax = sns.histplot(x, bins=10, color='blue', kde=True, stat='density',
                  kde_kws={'cut': 3}, ax=ax)
ax.set_title("Distribution of pitch variable")
plt.show()

In [None]:
# Feature Extraction and Feature Selection

In [None]:
# Correlation Matrix
# `df` contains at least one object column ('name'). Recent pandas versions no
# longer drop non-numeric columns silently inside `DataFrame.corr()` and raise
# instead, so the numeric (and boolean) columns are selected explicitly first.
# This behaves the same on old and new pandas.

numeric_df = df.select_dtypes(include=['number', 'bool'])
corrmat = numeric_df.corr()

f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1)
ax.set_title("Correlation matrix of numeric variables")
plt.show()

In [None]:
# Pearson's correlation heatmap
# Keep the k columns with the largest correlation to 'stall' (taken from
# `corrmat`, computed in an earlier cell) and plot their pairwise correlation
# coefficients with the values annotated in each tile.

plt.figure(figsize=(20, 12))
k = 18
cols = corrmat.nlargest(k, 'stall')['stall'].index
# rowvar=False treats each column as a variable, which matches transposing first.
cm = np.corrcoef(df[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [None]:
# Feature Selection using Random Forest

In [None]:
# Split data
# The split is done on unique flight names rather than on rows, so all rows that
# share a 'name' end up on the same side of the split.
# `random_state` is fixed so that the partition (and every metric computed from
# it) is reproducible across kernel restarts; 0 matches the seed used for the
# random forests in this notebook.

flight_name = df['name'].unique()

trp, tsp = train_test_split(flight_name, test_size=0.3, random_state=0)

In [None]:
# Train and test sets
# Build one boolean row mask per partition from the flight-name split.

in_train = df['name'].isin(trp)
in_test = df['name'].isin(tsp)

train_df = df.loc[in_train]
test_df = df.loc[in_test]

In [None]:
# Train input and output for ML
# Target is the 'stall' column; features are a positional slice of the columns.
# NOTE(review): the slice 13:-1 relies on the column order of the parquet file --
# confirm that neither 'stall' nor 'name' falls inside it.

y_train = train_df['stall']
X_train = train_df.iloc[:, 13:-1]

In [None]:
# Test input and output for ML
# Same target column and positional feature slice as the training set.
# NOTE(review): the slice 13:-1 relies on the column order of the parquet file --
# confirm that neither 'stall' nor 'name' falls inside it.

y_test = test_df['stall']
X_test = test_df.iloc[:, 13:-1]

In [None]:
# Feature scaling 
# Remember the feature column labels before scaling: the scaler output is
# wrapped back into a DataFrame later using `cols`.

cols = X_train.columns

In [None]:
# Scaling
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.

scaler = RobustScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled training array back into a DataFrame with the original feature labels.
# Pass the Index itself: `columns=[cols]` (a list holding the Index) makes pandas
# build a MultiIndex, turning every column label into a 1-tuple such as ('AOA',).
X_train = pd.DataFrame(X_train, columns=cols)

In [None]:
# Wrap the scaled test array back into a DataFrame with the original feature labels.
# Pass the Index itself: `columns=[cols]` (a list holding the Index) makes pandas
# build a MultiIndex, turning every column label into a 1-tuple such as ('AOA',).
X_test = pd.DataFrame(X_test, columns=cols)

In [None]:
# Random Forest Classifier model with 100 Decision Trees
# Build the classifier (n_estimators = 100, fixed seed) and fit it on the training
# set in one step; `fit` returns the fitted estimator itself.

rfc_100 = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)

# Predict the labels of the held-out test set

y_pred_100 = rfc_100.predict(X_test)

# Accuracy of the predictions against the true test labels

test_accuracy = accuracy_score(y_test, y_pred_100)
print('Model accuracy score with 100 decision-trees : {0:0.4f}'.format(test_accuracy))

In [None]:
# Find important features with Random Forest model 
# A separate classifier instance (n_estimators = 100, fixed seed) whose fitted
# state is used for the feature-importance scores.

clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

# Fit on the training set; as the last expression, the fitted estimator is displayed.

clf.fit(X_train, y_train)

In [None]:
# View the feature importance scores
# Label each importance with its feature column and sort from most to least important.

importances = clf.feature_importances_
feature_scores = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)

feature_scores

In [None]:
# Visualize feature scores of the features
# Horizontal seaborn bar plot: one bar per feature, length = importance score.

f, ax = plt.subplots(figsize=(28, 18))
heading_font = {'weight': 'bold', 'size': 40}

ax = sns.barplot(x=feature_scores, y=feature_scores.index, ax=ax)
ax.set_title("Visualizing Feature Scores of the Features", fontdict=dict(heading_font))
ax.set_yticklabels(feature_scores.index)
ax.set_xlabel("Feature Importance Score", fontdict=dict(heading_font))
ax.set_ylabel("Features", fontdict=dict(heading_font))

# Enlarge tick labels on both axes to match the large figure size.
plt.xticks(size=35)
plt.yticks(size=35)
plt.show()