# Reconnaissance de caractères

### Exploration

#### Les données 

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets, metrics
# Load the handwritten-digits dataset bundled with scikit-learn.
digits = datasets.load_digits()

In [None]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Display the shape of the image array.
digits.images.shape

In [None]:
# Print the raw image array.
print(digits.images)

In [None]:
# Print digits.data, which the notebook later uses as the feature matrix X.
print(digits.data)

In [None]:
# Print digits.target, which the notebook later uses as the label vector y.
print(digits.target)

In [None]:
# Preview the first eight digits on a 2 x 4 grid, each titled with its label.
images_and_labels = list(zip(digits.images, digits.target))
for position, (image, label) in enumerate(images_and_labels[:8], start=1):
    plt.subplot(2, 4, position)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

In [None]:
# Flatten each image into a single row (one row per sample),
# then display rows 1 to 4 of the flattened array.
n_samples = digits.images.shape[0]
data = digits.images.reshape(n_samples, -1)
data[1:5]

In [None]:
# Display the whole dataset object returned by load_digits().
digits


#### Analyse en composantes principales

In [None]:
from sklearn.decomposition import PCA

# Feature matrix, labels, and the ten digit classes used for plot legends.
X = digits.data
y = digits.target
target_name = list(range(10))

# Fit a PCA with its default settings, then project the data onto the components.
pca = PCA()
pca.fit(X)
C = pca.transform(X)

# Share of variance explained by each component.
plt.plot(pca.explained_variance_ratio_)
plt.show()

In [None]:
# Box plots of the first 20 principal components.
plt.boxplot(C[:, :20])
plt.show()

In [None]:
# Scatter of the first two principal components, coloured by digit label.
# NOTE(review): `label` receives the whole target_name list and no legend is
# drawn in this cell, so the argument has no visible effect here.
plt.scatter(C[:, 0], C[:, 1], c=y, label=target_name)
plt.show()

In [None]:
plt.figure()
# One scatter call per digit so that each class gets its own colour and its
# own legend entry. The loop variables must not reuse the name `target_name`:
# doing so would overwrite the list and make the cell fail on a second run.
for colour, digit, digit_label in zip("rgbcmykrgb", range(10), target_name):
    in_class = y == digit
    plt.scatter(C[in_class, 0], C[in_class, 1], c=colour, label=digit_label)
plt.legend()
plt.title("ACP Digits")
plt.show()

In [None]:
# Importing Axes3D registers the "3d" projection on old matplotlib releases;
# the name is also kept in scope for later cells.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(1, figsize=(8, 6))
# Create the 3D axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure in recent matplotlib (empty plot).
ax = fig.add_subplot(111, projection="3d")
ax.view_init(elev=-150, azim=110)
# First three principal components, coloured by true digit label.
ax.scatter(C[:, 0], C[:, 1], C[:, 2], c=y,
           cmap=plt.cm.Paired)
ax.set_title("ACP: trois premieres composantes")
# Hide tick labels on every axis. `ax.xaxis` replaces the removed `ax.w_xaxis`
# alias (same for y and z).
ax.set_xlabel("Comp1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("Comp2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("Comp3")
ax.zaxis.set_ticklabels([])
plt.show()

#### Classification non supervisée

In [None]:
import pandas as pd
from sklearn.cluster import KMeans

# Cluster the digits into 10 groups, one per expected class.
# n_init is pinned to 10 (the historical default) so that the number of
# restarts does not change with the installed scikit-learn version and no
# FutureWarning is raised about the default.
est = KMeans(n_clusters=10, n_init=10)
est.fit(X)
# Cluster id assigned to every sample.
classe = est.labels_
print(classe)

In [None]:
# Cross-tabulate k-means cluster ids (rows) against the true digit labels (columns).
table=pd.crosstab(classe,y)
print(table)

In [None]:
# Show the cross-tabulation as a colour-coded matrix with a colour bar.
plt.matshow(table)
plt.title("Matrice de Confusion")
plt.colorbar()
plt.show()

In [None]:
fig = plt.figure(1, figsize=(8, 6))
# Create the 3D axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure in recent matplotlib (empty plot).
ax = fig.add_subplot(111, projection="3d")
ax.view_init(elev=-150, azim=110)
# First three principal components, coloured by k-means cluster id.
ax.scatter(C[:, 0], C[:, 1], C[:, 2], c=classe,
           cmap=plt.cm.Paired)
ax.set_title("ACP: trois premieres composantes")
# Hide tick labels on every axis. `ax.xaxis` replaces the removed `ax.w_xaxis`
# alias (same for y and z).
ax.set_xlabel("Comp1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("Comp2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("Comp3")
ax.zaxis.set_ticklabels([])
plt.show()

### Modélisation

#### Échantillons

In [None]:
# Display the shape of the feature matrix X.
X.shape

#### K plus proches voisins

In [None]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples as a test set; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=11)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k fixed at 10.
knn = KNeighborsClassifier(n_neighbors=10)
digit_knn = knn.fit(X_train, y_train)

# Estimated prediction error on the test set.
1 - digit_knn.score(X_test, y_test)


In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes k = 1..14, compared by 5-fold cross-validation
# on the training set (n_jobs=-1 uses every available core).
param = [{"n_neighbors": list(range(1, 15))}]
knn = GridSearchCV(KNeighborsClassifier(), param, cv=5, n_jobs=-1)
digit_knn = knn.fit(X_train, y_train)
# Optimal parameter
digit_knn.best_params_["n_neighbors"]

In [None]:
# Refit a plain k-NN classifier with the k selected by the grid search.
# `digit_knn` is rebound to the refitted classifier below.
best_k = digit_knn.best_params_["n_neighbors"]
knn = KNeighborsClassifier(n_neighbors=best_k)
digit_knn = knn.fit(X_train, y_train)

# Estimated prediction error on the test set.
1 - digit_knn.score(X_test, y_test)

In [None]:
# Predict the test set, then cross-tabulate observed labels (rows)
# against predicted labels (columns): the confusion matrix.
y_chap = digit_knn.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)

# The same table drawn as a colour-coded matrix.
plt.matshow(table)
plt.title("Matrice de Confusion")
plt.colorbar()
plt.show()

#### Arbre de discrimination

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings.
tree = DecisionTreeClassifier()
digit_tree = tree.fit(X_train, y_train)

# Estimated prediction error on the test set.
1 - digit_tree.score(X_test, y_test)

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Candidate maximum depths 5..14, compared by 5-fold cross-validation.
param = [{"max_depth": list(range(5, 15))}]
digit_tree = GridSearchCV(DecisionTreeClassifier(), param, cv=5, n_jobs=-1)
digit_opt = digit_tree.fit(X_train, y_train)

In [None]:
# Parameter values selected by the grid search on tree depth.
digit_opt.best_params_

In [None]:
# Refit a single tree at the depth selected by the cross-validated grid search
# (digit_opt) rather than a hard-coded value, which goes stale whenever the
# search result changes from one run to the next.
tree = DecisionTreeClassifier(max_depth=digit_opt.best_params_["max_depth"])
digit_tree = tree.fit(X_train, y_train)
# Estimated prediction error on the test set.
1 - digit_tree.score(X_test, y_test)

In [None]:
# Predict the test set, then cross-tabulate observed labels (rows)
# against predicted labels (columns): the confusion matrix.
y_chap = digit_tree.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)

# The same table drawn as a colour-coded matrix.
plt.matshow(table)
plt.title("Matrice de Confusion")
plt.colorbar()
plt.show()

In [None]:
# sklearn.externals.six has been removed from scikit-learn; the standard
# library provides the same in-memory text buffer.
from io import StringIO

from sklearn.tree import export_graphviz
import pydot

# Write the fitted tree in Graphviz DOT format into an in-memory buffer.
dot_data = StringIO()
export_graphviz(digit_tree, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Recent pydot releases return a list of graphs, older ones a single graph.
if isinstance(graph, list):
    graph = graph[0]
# Render the tree to a PNG file.
graph.write_png("digit_tree.png")

In [None]:
from IPython.display import Image
# Display the PNG written by the tree-export cell.
Image(filename='digit_tree.png')

Souci : l'arbre est encore trop gros, il faut l'élaguer.

#### Forêts aléatoires

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Parameter definition: 500 trees of unlimited depth grown on bootstrap
# samples, with the out-of-bag score computed during fitting.
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers.
forest = RandomForestClassifier(
    n_estimators=500, criterion='gini', max_depth=None,
    min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
    max_leaf_nodes=None, bootstrap=True, oob_score=True)
# Training
forest = forest.fit(X_train, y_train)
# Out-of-bag error estimate.
print(1 - forest.oob_score_)

In [None]:
# Prediction error on the test set.
1-forest.score(X_test,y_test)

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Candidate numbers of features tried at each split: 4, 8, ..., 60,
# compared by 5-fold cross-validation on 100-tree forests.
param = [{"max_features": list(range(4, 64, 4))}]
digit_rf = GridSearchCV(RandomForestClassifier(n_estimators=100),
                        param, cv=5, n_jobs=-1)
digit_rf = digit_rf.fit(X_train, y_train)
# Optimal parameter
digit_rf.best_params_

In [None]:
# Refit a 500-tree forest with the max_features value selected by the grid
# search (digit_rf) rather than a hard-coded number, which goes stale whenever
# the search result changes from one run to the next.
forest = RandomForestClassifier(
    n_estimators=500, criterion='gini', max_depth=None,
    min_samples_split=2, min_samples_leaf=1,
    max_features=digit_rf.best_params_["max_features"],
    max_leaf_nodes=None, bootstrap=True, oob_score=True)
# Training
forest = forest.fit(X_train, y_train)
# Out-of-bag error estimate.
print(1 - forest.oob_score_)

In [None]:
# Prediction error on the test set.
1-forest.score(X_test,y_test)

In [None]:
# Predict the test set, then cross-tabulate observed labels (rows)
# against predicted labels (columns).
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)

In [None]:
# Draw the current confusion table as a colour-coded matrix with a colour bar.
plt.matshow(table)
plt.title("Matrice de Confusion")
plt.colorbar()
plt.show()

## Titanic

### Exploration

#### Les données

In [None]:
import pandas as pd

# Read six columns of the training file, selected by position and renamed;
# the file's first row is skipped. The four qualitative variables are read
# with dtype object so they are not parsed as numbers.
df = pd.read_csv(
    "titanic-train.csv",
    skiprows=1,
    header=None,
    usecols=[1, 2, 4, 5, 9, 11],
    names=["Surv", "Classe", "Genre", "Age", "Prix", "Port"],
    dtype={"Surv": object, "Classe": object, "Genre": object, "Port": object},
)
df.head()

In [None]:
# Display the shape (rows, columns) of the training table.
df.shape

In [None]:
# Read five columns of the test file, selected by position and renamed; the
# file's first row is skipped. No "Surv" column is loaded here, so the dtype
# mapping only lists the qualitative columns that are actually read.
df_test = pd.read_csv(
    "titanic-test.csv",
    skiprows=1,
    header=None,
    usecols=[1, 3, 4, 8, 10],
    names=["Classe", "Genre", "Age", "Prix", "Port"],
    dtype={"Classe": object, "Genre": object, "Port": object},
)
df_test.shape

In [None]:
# Show the first rows of the test table.
df_test.head()

In [None]:
# Convert the four qualitative variables to unordered categoricals.
for column in ["Surv", "Classe", "Genre", "Port"]:
    df[column] = pd.Categorical(df[column], ordered=False)
df.dtypes

In [None]:
# Number of non-missing values per column.
df.count()

In [None]:
# Impute missing values: median for Age, the category "S" for Port.
median_age = df["Age"].median()
df["Age"] = df["Age"].fillna(median_age)
df["Port"] = df["Port"].fillna("S")
df.count()

In [None]:
# Cut Age and Prix into three quantile-based classes each.
df["AgeQ"] = pd.qcut(df["Age"], 3, labels=["Ag1", "Ag2", "Ag3"])
df["PrixQ"] = pd.qcut(df["Prix"], 3, labels=["Pr1", "Pr2", "Pr3"])
df["PrixQ"].describe()

In [None]:
# Give every category a short prefixed label. The new names are applied
# positionally, i.e. in the existing order of each column's categories.
new_categories = {
    "Surv": ["Vnon", "Voui"],
    "Classe": ["Cl1", "Cl2", "Cl3"],
    "Genre": ["Gfem", "Gmas"],
    "Port": ["Pc", "Pq", "Ps"],
}
for column, labels in new_categories.items():
    df[column] = df[column].cat.rename_categories(labels)

In [None]:
# Show the first rows after recoding.
df.head()

In [None]:
# Qualitative-only table: drop the two quantitative columns.
df_q = df.drop(columns=["Age", "Prix"])
df_q.head()

In [None]:
# Indicator (dummy) coding of the six qualitative variables, one column per category.
dc=pd.DataFrame(pd.get_dummies(df_q[["Surv","Classe","Genre","Port","AgeQ","PrixQ"]]))

In [None]:
# Show the first rows of the indicator table.
dc.head()

In [None]:
# List the indicator column names.
dc.columns

#### Analyse factorielle multiple des correspondances

In [None]:
from mca import mca

In [None]:
# Run the multiple correspondence analysis on the indicator table.
# NOTE(review): benzecri=False presumably disables the Benzecri correction -- confirm in the mca package docs.
mca_df=mca(dc,benzecri=False)

In [None]:
# NOTE(review): `L` presumably holds the eigenvalues (principal inertias) -- confirm in the mca package docs.
print(mca_df.L)

In [None]:
# NOTE(review): fs_c() presumably returns the factor scores of the columns (categories) -- confirm in the mca package docs.
print(mca_df.fs_c())

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
col=[1,1,2,2,2,3,3,5,5,5,6,6,6,7,7,7]
plt.scatter(mca_df.fs_c()[:, 0], mca_df.fs_c()[:, 1],c=col)
for i, j, nom in zip(mca_df.fs_c()[:, 0],  mca_df.fs_c()[:, 1], dc.columns):
    plt.text(i, j, nom)
plt.show()

In [None]:
# Show the first rows of the full training table.
df.head()

### Modélisation

#### Préparation des données

In [None]:
# Summary of columns, non-null counts and dtypes.
df.info()

In [None]:
# Indicator coding of the qualitative variables for modelling.
qualitative = ["Surv", "Classe", "Genre", "Port", "AgeQ", "PrixQ"]
df1 = pd.get_dummies(df_q[qualitative])
# Drop one indicator of each of the two binary variables.
df1 = df1.drop(columns=["Surv_Vnon", "Genre_Gmas"])
df1.head()

In [None]:
# The two quantitative variables.
df2=df[["Age","Prix"]]
df2.head()

In [None]:
# Put indicator columns and quantitative columns side by side.
df_c=pd.concat([df1,df2],axis=1)
df_c.columns

In [None]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# Explanatory variables
T = df_c.drop(["Surv_Voui"], axis=1)
# Variable to model
z = df_c["Surv_Voui"]
# Hold out 20% of the rows as a test set; the seed makes the split reproducible.
T_train, T_test, z_train, z_test = train_test_split(
    T, z, test_size=0.2, random_state=11)

#### Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings.
logit = LogisticRegression()
titan_logit = logit.fit(T_train, z_train)

# Error on the test set.
1 - titan_logit.score(T_test, z_test)

In [None]:
# Coefficients of the fitted logistic regression.
titan_logit.coef_ 

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Candidate values of C, compared by 5-fold cross-validation.
param = [{"C": [0.01, 0.096, 0.098, 0.1, 0.12, 1, 10]}]
# The L1 penalty needs a solver that supports it; the current default solver
# does not, so liblinear is requested explicitly.
logit = GridSearchCV(
    LogisticRegression(penalty="l1", solver="liblinear"),
    param, cv=5, n_jobs=-1)
titan_logit = logit.fit(T_train, z_train)
# Optimal parameter
titan_logit.best_params_["C"]

In [None]:
# Refit an L1-penalised model with the C selected by the grid search. The
# previous hard-coded C=0.98 was not even one of the searched values.
# Run this cell right after the grid-search cell: `titan_logit` is rebound
# below to the refitted model, which has no `best_params_`.
best_C = titan_logit.best_params_["C"]
# liblinear is requested because the current default solver rejects penalty="l1".
logit = LogisticRegression(C=best_C, penalty="l1", solver="liblinear")
titan_logit = logit.fit(T_train, z_train)
# Error on the test set.
1 - titan_logit.score(T_test, z_test)

In [None]:
# Coefficients of the refitted L1-penalised model.
titan_logit.coef_ 

#### Arbre de discrimination

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings.
tree = DecisionTreeClassifier()
titan_tree = tree.fit(T_train, z_train)

# Estimated prediction error on the test set.
1 - titan_tree.score(T_test, z_test)

In [None]:
# Candidate maximum depths 2..9, compared by 5-fold cross-validation.
depth_grid = list(range(2, 10))
param = [{"max_depth": depth_grid}]
titan_tree = GridSearchCV(DecisionTreeClassifier(), param, cv=5, n_jobs=-1)
titan_opt = titan_tree.fit(T_train, z_train)
# Optimal parameter
titan_opt.best_params_

In [None]:
# Refit a single tree at the depth selected by the cross-validated grid search
# (titan_opt) rather than a hard-coded value, which goes stale whenever the
# search result changes from one run to the next.
tree = DecisionTreeClassifier(max_depth=titan_opt.best_params_["max_depth"])
titan_tree = tree.fit(T_train, z_train)
# Estimated prediction error on the test sample. The expression used to be
# written twice; only a cell's last expression is displayed, so one suffices.
1 - titan_tree.score(T_test, z_test)

In [None]:
# Predict the test sample.
z_chap = titan_tree.predict(T_test)
# Confusion matrix: observed (rows) against predicted (columns).
table = pd.crosstab(z_test, z_chap)
print(table)

In [None]:
# sklearn.externals.six has been removed from scikit-learn; the standard
# library provides the same in-memory text buffer.
from io import StringIO

from sklearn.tree import export_graphviz
import pydot

# Write the fitted tree in Graphviz DOT format into an in-memory buffer.
dot_data = StringIO()
export_graphviz(titan_tree, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Recent pydot releases return a list of graphs, older ones a single graph.
if isinstance(graph, list):
    graph = graph[0]
# Render the tree to a PNG file.
graph.write_png("titan_tree.png")

In [None]:
from IPython.display import Image
# Display the PNG written by the tree-export cell.
Image(filename='titan_tree.png')

In [None]:
# Display the table of explanatory variables.
T

#### Forêts aléatoires

In [None]:
# définition des paramètres
forest = RandomForestClassifier(n_estimators=500, 
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1, 
   max_features='auto', max_leaf_nodes=None,
   bootstrap=True, oob_score=True)
# apprentissage
forest = forest.fit(T_train,z_train)
print(1-forest.oob_score_)
# erreur de prévision sur le test
1-forest.score(T_test,z_test)

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Candidate numbers of features tried at each split: 2..7, compared by
# 5-fold cross-validation on 100-tree forests.
param = [{"max_features": list(range(2, 8))}]
titan_rf = GridSearchCV(RandomForestClassifier(n_estimators=100),
                        param, cv=5, n_jobs=-1)
titan_rf = titan_rf.fit(T_train, z_train)
# Optimal parameter
titan_rf.best_params_

In [None]:
# Refit a 500-tree forest with the max_features value selected by the grid
# search (titan_rf). The previous hard-coded value, 13, was outside the
# searched range 2..7.
forest = RandomForestClassifier(
    n_estimators=500,
    criterion='gini', max_depth=None,
    min_samples_split=2, min_samples_leaf=1,
    max_features=titan_rf.best_params_["max_features"],
    max_leaf_nodes=None,
    bootstrap=True, oob_score=True)
# Training
forest = forest.fit(T_train, z_train)
# Out-of-bag error estimate.
print(1 - forest.oob_score_)
# Prediction error on the test set. It is printed explicitly: a bare
# expression in the middle of a cell is evaluated but never displayed.
print(1 - forest.score(T_test, z_test))
# Prediction
z_chap = forest.predict(T_test)
# Confusion matrix: observed (rows) against predicted (columns).
table = pd.crosstab(z_test, z_chap)
print(table)