In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Read just the three columns this notebook works with.
# NOTE(review): the path is an absolute, machine-specific location.
TRAIN_CSV = '/home/winkle/Downloads/train.csv'
USED_COLUMNS = ['Age','Fare','Survived']

df = pd.read_csv(TRAIN_CSV, usecols=USED_COLUMNS)

In [None]:
# Show the first five rows as a quick check of the loaded frame.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df['Age'] selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and no longer modifies `df` once copy-on-write is enabled.
# NOTE(review): the mean is taken over the whole frame, i.e. before the
# train/test split, so held-out rows influence the imputed value.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
# Features are taken positionally from columns 1-2 and the target from column 0.
features = df.iloc[:, 1:3]
target = df.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Kernel density estimate (left) and normal Q-Q plot (right) of training-set Age.
age_values = list(X_train['Age'].dropna())  # plain list with NaNs removed

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.kdeplot(x=age_values, fill=True)
plt.title('Age Distribution (PDF)')

plt.subplot(1, 2, 2)
stats.probplot(age_values, dist="norm", plot=plt)
plt.title('Age QQ Plot')

plt.show()

In [None]:
# Kernel density estimate (left) and normal Q-Q plot (right) of training-set Fare.
fare_values = list(X_train['Fare'].dropna())  # plain list with NaNs removed

plt.figure(figsize=(14, 4))

plt.subplot(1, 2, 1)
sns.kdeplot(x=fare_values, fill=True)
plt.title('Fare Distribution (PDF)')

plt.subplot(1, 2, 2)
stats.probplot(fare_values, dist="norm", plot=plt)
plt.title('Fare QQ Plot')

plt.show()

In [None]:
# The two models compared throughout: logistic regression and a decision tree.
clf = LogisticRegression()
# Seed the tree: scikit-learn permutes candidate features at every split, so
# an unseeded tree can produce different results from one run to the next.
clf2 = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit both models on the untransformed training features, then report
# accuracy on the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
y_pred1 = clf2.fit(X_train, y_train).predict(X_test)

for label, predictions in (('Accuracy LR', y_pred), ('Accuracy DT', y_pred1)):
    print(label, accuracy_score(y_test, predictions))

In [None]:
# Transformer that applies np.log1p, i.e. log(1 + x), to whatever it is given.
trf = FunctionTransformer(np.log1p)

In [None]:
# Fit on the training split, then apply the same transform to both splits.
trf.fit(X_train)
X_train_transform = trf.transform(X_train)
X_test_transform = trf.transform(X_test)

In [None]:
# Refit both models on the log-transformed training features and report
# accuracy on the transformed held-out split.
y_pred = clf.fit(X_train_transform, y_train).predict(X_test_transform)
y_pred1 = clf2.fit(X_train_transform, y_train).predict(X_test_transform)

for label, predictions in (('Accuracy LR', y_pred), ('Accuracy DT', y_pred1)):
    print(label, accuracy_score(y_test, predictions))

In [None]:
# 10-fold cross-validated accuracy of both models on the log-transformed data.
# X and y are built here because the cell that defines them sits further down
# the notebook, so on a fresh kernel ("Restart & Run All") this cell raised a
# NameError.
X = df.iloc[:,1:3]
y = df.iloc[:,0]

X_transformed = trf.fit_transform(X)

clf = LogisticRegression()
# Seeded so the tree's cross-validation scores are repeatable between runs.
clf2 = DecisionTreeClassifier(random_state=0)

print("LR",np.mean(cross_val_score(clf,X_transformed,y,scoring='accuracy',cv=10)))
print("DT",np.mean(cross_val_score(clf2,X_transformed,y,scoring='accuracy',cv=10)))

In [None]:
# Full feature matrix (columns 1-2) and target vector (column 0), selected by position.
X, y = df.iloc[:, 1:3], df.iloc[:, 0]

In [None]:
# Normal Q-Q plots of training-set Fare: raw values (left) vs. log-transformed (right).
plt.figure(figsize=(14, 4))

fare_panels = (
    (121, X_train['Fare'], 'Fare Before Log'),
    (122, X_train_transform['Fare'], 'Fare After Log'),
)
for position, values, heading in fare_panels:
    plt.subplot(position)
    stats.probplot(values, dist="norm", plot=plt)
    plt.title(heading)

plt.show()

In [None]:
# Normal Q-Q plots of training-set Age: raw values (left) vs. log-transformed (right).
plt.figure(figsize=(14, 4))

age_panels = (
    (121, X_train['Age'], 'Age Before Log'),
    (122, X_train_transform['Age'], 'Age After Log'),
)
for position, values, heading in age_panels:
    plt.subplot(position)
    stats.probplot(values, dist="norm", plot=plt)
    plt.title(heading)

plt.show()

In [None]:
# Apply log1p to 'Fare' only; the remaining columns are passed through untouched.
fare_log_step = ('log', FunctionTransformer(np.log1p), ['Fare'])
trf2 = ColumnTransformer(transformers=[fare_log_step], remainder='passthrough')

X_train_transformed2 = trf2.fit_transform(X_train)
X_test_transformed2 = trf2.transform(X_test)


In [None]:
# Fresh models trained on the data where only 'Fare' was log-transformed,
# scored on the matching held-out split.
clf = LogisticRegression()
# Seeded so the tree's accuracy is repeatable; scikit-learn permutes candidate
# features at each split, which otherwise makes results vary between runs.
clf2 = DecisionTreeClassifier(random_state=0)

clf.fit(X_train_transformed2,y_train)
clf2.fit(X_train_transformed2,y_train)

y_pred = clf.predict(X_test_transformed2)
y_pred2 = clf2.predict(X_test_transformed2)

print("Accuracy LR",accuracy_score(y_test,y_pred))
print("Accuracy DT",accuracy_score(y_test,y_pred2))

In [None]:
# 10-fold cross-validated accuracy on the full dataset with only 'Fare' log-transformed.
X_transformed2 = trf2.fit_transform(X)

clf = LogisticRegression()
# Seeded so the tree's cross-validation scores are repeatable between runs.
clf2 = DecisionTreeClassifier(random_state=0)

print("LR",np.mean(cross_val_score(clf,X_transformed2,y,scoring='accuracy',cv=10)))
print("DT",np.mean(cross_val_score(clf2,X_transformed2,y,scoring='accuracy',cv=10)))

In [None]:
def apply_transform(transform):
    """Evaluate a transform of the 'Fare' column and plot its effect.

    Takes the features (columns 1-2) and target (column 0) from the
    module-level ``df``, applies ``transform`` to 'Fare' while passing the
    other column through, prints the mean 10-fold cross-validated accuracy of
    a logistic regression on the result, and shows normal Q-Q plots of 'Fare'
    before and after the transform.

    Parameters
    ----------
    transform : callable
        Function handed to ``FunctionTransformer`` (e.g. ``np.log1p``).

    Returns
    -------
    None
        Output is the printed accuracy and the displayed figure.
    """
    X = df.iloc[:,1:3]
    y = df.iloc[:,0]

    # The step name had notebook JSON pasted into it, which terminated the
    # string literal early and made this line a syntax error; 'log' matches
    # the step name used for the other ColumnTransformer in this notebook.
    trf = ColumnTransformer([('log',FunctionTransformer(transform),['Fare'])],remainder='passthrough')

    X_trans = trf.fit_transform(X)

    clf = LogisticRegression()

    print("Accuracy",np.mean(cross_val_score(clf,X_trans,y,scoring='accuracy',cv=10)))

    plt.figure(figsize=(14,4))

    plt.subplot(121)
    stats.probplot(X['Fare'], dist="norm", plot=plt)
    plt.title('Fare Before Transform')

    plt.subplot(122)
    # The transformed 'Fare' is column 0: ColumnTransformer places the listed
    # transformer's output ahead of the passthrough columns.
    stats.probplot(X_trans[:,0], dist="norm", plot=plt)
    plt.title('Fare After Transform')

    plt.show()
    

In [None]:
# Run the evaluation with np.sin as the transform applied to 'Fare'.
apply_transform(transform=np.sin)