In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in walk order.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
# The file is tab-separated and has no header row, so columns get integer labels here.
AIRFOIL_DATA_PATH = "../input/airfoil-self-noise/airfoil_self_noise.dat"
data = pd.read_table(AIRFOIL_DATA_PATH, sep="\t", header=None)
data.head()

In [5]:
# Replace the positional integer labels with descriptive names (one per column, in file order).
data.columns = [
    "Frequency",
    "Angle",
    "Chord length",
    "Free-stream velocity",
    "Suction side displacement thickness",
    "sound pressure",
]

In [6]:
# Preview the first five rows to confirm the column names were applied.
data.head(n=5)

### Checking for NaN values and handling missing values

In [7]:
# Number of missing entries in each column.
data.isna().sum()

##### Since there are no missing values and all columns are numeric, we can proceed with splitting the data and selecting a model

In [8]:
# Pairwise Pearson correlation between all columns (features and target).
data.corr(method="pearson")

In [9]:
from sklearn.model_selection import train_test_split
# Features: every column except the last. Target: the last column.
X = data.iloc[:, :-1]
# Keep the target as a column vector, but let NumPy infer the row count
# (reshape(-1, 1)) instead of hard-coding 1503, so the cell keeps working
# if the dataset gains or loses rows.
y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

In [10]:
# Hold out a test set using train_test_split's default proportion; the fixed
# random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Report the training shapes. (The stray bare `y` expression that used to sit
# between these prints was a no-op and has been removed.)
print(x_train.shape)
print(y_train.shape)

In [11]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
# One default-configured estimator per candidate model family,
# constructed in the same order as they are named.
L_regressor, R_regressor, La_regressor, DT_regressor = (
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(),
)

## Check the performance of all regression models

In [12]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the TRAINING split only, so the held-out test rows
# are not used for model comparison (this is also the data the Ridge/Lasso grid
# search is fitted on, which keeps the scores comparable).
# The 'neg_mean_squared_error' scorer returns the NEGATED MSE: values are <= 0
# and closer to 0 is better, so the label below says "negative" explicitly.
mse = cross_val_score(L_regressor, x_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)
print("Negative mean_squared_error for Linear_Regression: ", mean_mse)

In [13]:
from sklearn.model_selection import GridSearchCV
# Candidate regularisation strengths shared by the Ridge and Lasso searches.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}


def check_performance(model, x_train, y_train, param_grid=None):
    """Grid-search `model` with 5-fold CV and report the best setting.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    x_train, y_train : array-like
        Data the grid search is fitted on.
    param_grid : dict, optional
        Hyper-parameter grid. Defaults to the module-level `parameters` grid.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its `best_params_` and `best_score_`
        (negated MSE, closer to 0 is better) are also printed.
    """
    grid = parameters if param_grid is None else param_grid
    search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=5)
    search.fit(x_train, y_train)
    print(search.best_params_)
    print(search.best_score_)
    return search


# The function prints its own report; keep the fitted searches instead of
# printing the return value (which previously emitted a spurious "None").
lasso_search = check_performance(La_regressor, x_train, y_train)
ridge_search = check_performance(R_regressor, x_train, y_train)

###### Comparing the three linear models, there is not much difference, but Ridge and Lasso perform slightly better. Now let's check the decision tree

In [14]:
# 5-fold cross-validation on the TRAINING split only: this is the data the
# Ridge/Lasso grid search is fitted on, and it keeps the held-out test rows
# out of model selection.
regressor = DecisionTreeRegressor(random_state=0)
mse = cross_val_score(regressor, x_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)
# Negated MSE (scorer convention): closer to 0 is better.
print(mean_mse)

#### As we can see, the DecisionTreeRegressor model performs even better, so we will use this model for our dataset

In [15]:
# Fix the seed so the fitted tree (and the model pickled from it) is the same on
# every run; this matches the random_state used when the tree was cross-validated.
Model = DecisionTreeRegressor(random_state=0)
Model.fit(x_train, y_train)

In [16]:
from sklearn import tree
import matplotlib.pyplot as plt
# Very large canvas so the nodes of the fitted tree have room to be drawn.
tree_figure = plt.figure(figsize=(150, 100))
tree.plot_tree(decision_tree=Model, filled=True)

### Finally, we will save the model in a pickle file for later use when building the app

In [19]:
import pickle
# Write the fitted model inside a context manager so the file handle is
# flushed and closed even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(Model, model_file)

In [20]:
from sklearn.metrics import r2_score
y_predict = Model.predict(x_test)
# r2_score expects (y_true, y_pred) in that order; R^2 is not symmetric, so
# swapping the arguments reports a different (incorrect) score.
r2_score(y_test, y_predict)

### We will create a table to inspect, visually/manually, how close the predictions are to the actual values

In [21]:
# Test features side by side with the actual and predicted target values.
# `assign` returns a new frame, so x_test itself is left untouched.
copy = x_test.assign(
    **{
        "y_test_pressure": y_test,
        "y_pred_pressure": y_predict,
    }
)
copy

##### This is just to see/validate manually how close our predicted values are to the actual ones

In [22]:
y_predict = Model.predict(x_test)

# Predicted (x) vs. actual (y) values on the test set. Points on the dashed
# y = x line are perfect predictions; distance from it is the error.
fig, ax = plt.subplots()
ax.scatter(y_predict, y_test)
bounds = [
    min(y_predict.min(), y_test.min()),
    max(y_predict.max(), y_test.max()),
]
ax.plot(bounds, bounds, linestyle="--", color="gray", label="y = x")
ax.set_xlabel("Predicted sound pressure")
ax.set_ylabel("Actual sound pressure")
ax.set_title("Decision tree: predicted vs. actual (test set)")
ax.legend();