In [34]:
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [35]:
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [36]:
# Preview the first rows of the training frame (label column followed by pixel columns).
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Dimensions of the training frame as (rows, columns).
train.shape

(42000, 785)

In [38]:
# Targets come from the 'label' column; features are every column after the
# first one. The test frame is used whole as the test feature matrix.
y_train = train['label'].values
X_train, X_test = train.iloc[:, 1:].values, test.values

In [39]:
# Dimensions of the test frame as (rows, columns).
test.shape

(28000, 784)

In [40]:
import time

### Step 1: Fitting a Random Forest

In [41]:
# Baseline model: a random forest trained directly on the raw pixel features.
rf1 = RandomForestClassifier(random_state=0)
# Time only the fit call. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals, unlike time.time(), which follows the
# system wall clock and can jump if that clock is adjusted.
start = time.perf_counter()
rf1.fit(X_train, y_train)
end = time.perf_counter()
print(f'The training time is {end - start}')



The training time is 6.05925726890564


In [42]:
# Submission 1
# Predict the test labels with the forest trained on raw pixels and write
# them to a two-column CSV (ImageId, Label).
pred = rf1.predict(X_test)
submission_1 = pd.DataFrame({
    # ImageId is numbered from 1 (not 0) to match the 1..N ids used by the
    # Digit Recognizer submission format.
    'ImageId': range(1, len(X_test) + 1),
    'Label': pred,
})
# index=False keeps the DataFrame index out of the file.
submission_1.to_csv('Submission_1.csv', index=False)

### Step 2 : Fit PCA on train and test data

In [43]:
# Seed NumPy's global RNG; PCA is additionally given random_state=0 below.
np.random.seed(0)
# Stack the train rows on top of the test rows so PCA is fitted on both sets.
# NOTE: this deliberately follows the step description; Step 5 further down
# refits PCA on the training data only.
matrix = np.vstack([X_train, X_test])

# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
start = time.perf_counter()

# n_components=0.95 requests enough components to reach 95% explained variance.
pca = PCA(n_components=0.95, random_state=0)
pca.fit(matrix)

end = time.perf_counter()

print(f'The PCA Fitting time is {end - start}')
# n_components_ is the number of components PCA actually kept, which equals
# the length of explained_variance_ratio_.
print('Minimum components to fit PCA to at least 0.95 variability is ' + str(pca.n_components_))

The PCA Fitting time is 17.450759887695312
Minimum components to fit PCA to at least 0.95 variability is 154


### Step 3: Fitting a Random Forest using the previous PCA

In [44]:
# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
start = time.perf_counter()

# Transform X_train with the already-fitted PCA (no refit).
X_train_pca = pca.transform(X_train)
rf2 = RandomForestClassifier(random_state=0)
rf2.fit(X_train_pca, y_train)

end = time.perf_counter()
# The measured interval covers both the PCA projection and the forest fit.
print(f'The training time is {end - start}')



The training time is 16.12244439125061


In [45]:
# Submission 2
X_test_pca = pca.transform(X_test)
pred = rf2.predict(X_test_pca)
submission_2 = pd.DataFrame(columns=['ImageId','Label'])
submission_2['ImageId'] = range(len(X_test))
submission_2['Label'] = pred
submission_2.to_csv('Submission_2.csv',index=None)

### Step 4, AFTER SUBMISSION

### Step 5 , Fitting PCA only on the train data

Fitting PCA on both the train and test data is wrong: the test data must stay hidden until prediction, so it cannot be used in `PCA.fit`. Doing so leaks information about the test set into the features the model is trained on.

In [46]:
# Seed NumPy's global RNG; PCA and the forest are also given random_state=0.
np.random.seed(0)

# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
start = time.perf_counter()

# Fit PCA on the training features only, so the test set plays no part in
# choosing the projection.
pca = PCA(n_components=0.95, random_state=0)
pca.fit(X_train)

# Transform X_train with the PCA fitted above (no refit).
X_train_pca = pca.transform(X_train)

rf3 = RandomForestClassifier(random_state=0)
rf3.fit(X_train_pca, y_train)

end = time.perf_counter()
# The interval covers the PCA fit, the projection and the forest fit.
print(f'The training time and PCA fitting is {end - start}')
# n_components_ is the number of components PCA actually kept, which equals
# the length of explained_variance_ratio_.
print('Minimum components to fit PCA to at least 0.95 variability is ' + str(pca.n_components_))



The training time and PCA fitting is 26.143166303634644
Minimum components to fit PCA to at least 0.95 variability is 154


In [47]:
# Submission 3
X_test_pca = pca.transform(X_test)
pred = rf3.predict(X_test_pca)
submission_3 = pd.DataFrame(columns=['ImageId','Label'])
submission_3['ImageId'] = range(len(X_test))
submission_3['Label'] = pred
submission_3.to_csv('Submission_3.csv',index=None)