# Lesson's code
[https://medium.com/open-machine-learning-course/open-machine-learning-course-topic-2-visual-data-analysis-in-python-846b989675cd](https://medium.com/open-machine-learning-course/open-machine-learning-course-topic-2-visual-data-analysis-in-python-846b989675cd)

In [None]:
# Notebook setup: imports plus display and plotting defaults
import numpy as np
import pandas as pd
# Show at most 12 columns when a DataFrame is displayed
pd.options.display.max_columns = 12
# Silence all Python warnings for the session (note: this also hides
# deprecation notices, so re-enable warnings when upgrading libraries)
import warnings
warnings.simplefilter('ignore')
# Render plots inline, right inside the Jupyter Notebook
%matplotlib inline
import matplotlib.pyplot as plt
# Use the Seaborn library and apply its default theme
import seaborn as sns
sns.set()
# Graphics in SVG format are sharper and more legible
%config InlineBackend.figure_format = 'svg'
# Set the default figure size to 5 x 4 (width, height)
from pylab import rcParams
rcParams['figure.figsize'] = 5, 4

In [None]:
# Read the telecom churn CSV into the working DataFrame
df = pd.read_csv(
    '../01-pandas-data-analyse/telecom_churn.csv',
)

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Columns examined in the histogram, density and summary cells
features = [
    'Total day minutes',
    'Total intl calls',
]
# One histogram per selected column
df.loc[:, features].hist(figsize=(12, 4))

In [None]:
# Kernel density estimate of each selected column, side by side,
# each panel keeping its own x-axis
df[features].plot.density(
    subplots=True,
    layout=(1, 2),
    sharex=False,
    figsize=(12, 4),
)

In [None]:
# Density-normalised histogram of international call counts with a KDE
# curve overlaid. `sns.distplot` is deprecated in current seaborn releases;
# `histplot` with `kde=True, stat='density'` is its replacement, and
# `cut=3` keeps the KDE curve extending past the data range as before.
sns.histplot(df['Total intl calls'], kde=True, stat='density',
             kde_kws={'cut': 3})

In [None]:
# Box plot of international call counts on a single narrow axes
_, ax = plt.subplots(figsize=(3, 4))
sns.boxplot(
    ax=ax,
    data=df['Total intl calls'],
);

In [None]:
# Same column drawn twice on a shared y-axis:
# box plot on the left, violin plot on the right
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 4), sharey=True)

sns.boxplot(ax=axes[0], data=df['Total intl calls'])
sns.violinplot(ax=axes[1], data=df['Total intl calls'])

In [None]:
# Summary statistics for the selected columns
df.loc[:, features].describe()

In [None]:
# Frequency bar charts for two discrete columns, one per panel
_, axes = plt.subplots(1, 2, figsize=(12, 4))

for target_ax, col_name in zip(axes, ('Churn', 'Customer service calls')):
    sns.countplot(data=df, x=col_name, ax=target_ax)

In [None]:
# Drop the variables this lesson treats as non-numerical
non_numerical = {'State', 'International plan', 'Voice mail plan',
                 'Area code', 'Churn', 'Customer service calls'}
# Filter in DataFrame column order instead of taking a set difference:
# a set has arbitrary ordering, which would shuffle the heatmap axes
# (and every later plot driven by `numerical`) from one session to the next.
numerical = [col for col in df.columns if col not in non_numerical]

# Pairwise correlations between the remaining columns, shown as a heatmap
corr_matrix = df[numerical].corr()
sns.heatmap(corr_matrix);

In [None]:
# Remove the four charge columns from the feature list
# (presumably because each duplicates its matching minutes column;
# confirm against the correlation heatmap).
charge_columns = {'Total day charge', 'Total eve charge',
                  'Total night charge', 'Total intl charge'}
# Filter the existing list rather than round-tripping through a set, so the
# remaining features keep a stable, reproducible order.
numerical = [col for col in numerical if col not in charge_columns]

In [None]:
# Day minutes against night minutes, one point per row
plt.scatter(
    x=df['Total day minutes'],
    y=df['Total night minutes'],
)

In [None]:
# Scatter plot of the two columns with their marginal distributions
sns.jointplot(
    data=df,
    x='Total day minutes',
    y='Total night minutes',
    kind='scatter',
)

In [None]:
# Smoothed (KDE) version of the joint plot, drawn in green.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, and this matches the scatter joint plot call.
sns.jointplot(x='Total day minutes', y='Total night minutes',
              data=df, kind="kde", color="g");

In [None]:
# pairplot may become very slow with the SVG format
%config InlineBackend.figure_format = 'png'

sns.pairplot(df[numerical])

In [None]:
# Scatter of day vs. night minutes coloured by churn, with the regression
# fit disabled. x and y are passed by keyword because recent seaborn
# releases no longer accept them positionally.
sns.lmplot(x='Total day minutes', y='Total night minutes', data=df,
           hue='Churn', fit_reg=False);

In [None]:
# Sometimes an ordinal variable can be analyzed as a numerical one.
# Guard the append so re-running this cell does not add the column a second
# time (a duplicate entry would overflow the subplot grid below).
if 'Customer service calls' not in numerical:
    numerical.append('Customer service calls')

# Derive the number of rows from the feature count instead of hard-coding it,
# so the grid always has room for every feature.
n_cols = 4
n_rows = max(1, -(-len(numerical) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 7),
                         squeeze=False)

# One box plot per feature, split by churn, filling the grid row by row
for idx, feat in enumerate(numerical):
    ax = axes[idx // n_cols, idx % n_cols]
    sns.boxplot(x='Churn', y=feat, data=df, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel(feat)

# Hide any axes left over at the end of the grid so no empty frame is drawn
for unused_ax in axes.ravel()[len(numerical):]:
    unused_ax.set_visible(False)
fig.tight_layout();

In [None]:
# Day minutes split by churn on a shared y-axis:
# box plot on the left, violin plot on the right
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharey=True)
for plot_fn, target_ax in zip((sns.boxplot, sns.violinplot), axes):
    plot_fn(data=df, x='Churn', y='Total day minutes', ax=target_ax)

In [None]:
# Box plots of day minutes by churn, one panel per number of customer
# service calls, wrapped four panels per row.
# `factorplot` was renamed to `catplot` and its `size` argument to `height`;
# the old spellings no longer exist in current seaborn releases.
# To restrict the grid to customers with fewer than 8 service calls, pass
# data=df[df['Customer service calls'] < 8] instead of the full frame.
sns.catplot(x='Churn', y='Total day minutes',
            col='Customer service calls',
            data=df,
            kind="box", col_wrap=4, height=3, aspect=.8);

In [None]:
# Number of customers per service-call count, split by churn
sns.countplot(
    data=df,
    x='Customer service calls',
    hue='Churn',
);

In [None]:
# Customer counts for each plan column, split by churn, on a shared y-axis
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharey=True)
for target_ax, plan_col in zip(axes, ('International plan', 'Voice mail plan')):
    sns.countplot(data=df, x=plan_col, hue='Churn', ax=target_ax)

In [None]:
# Contingency table: churn values as rows, states as columns
pd.crosstab(
    index=df['Churn'],
    columns=df['State'],
)

In [None]:
# Mean of the Churn column per state, highest first, transposed so the
# states run across the columns.
# The aggregation is named by the string 'mean' rather than passing
# `np.mean`: handing NumPy callables to groupby `agg` is deprecated in
# recent pandas, and the resulting column is still called 'mean'.
(
    df.groupby(['State'])['Churn']
    .agg(['mean'])
    .sort_values(by='mean', ascending=False)
    .T
)

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: everything except the Churn and State columns,
# with the two Yes/No plan columns recoded as 1/0
X = df.drop(columns=['Churn', 'State'])
for plan_col in ('International plan', 'Voice mail plan'):
    X[plan_col] = X[plan_col].map({'Yes': 1, 'No': 0})

In [None]:
# Standardize the feature matrix with scikit-learn's StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
%%time

# Fit t-SNE on the scaled feature matrix and keep the embedded coordinates;
# the cell magic above reports how long the cell takes to run.
# random_state is fixed to seed the run.
tsne = TSNE(random_state=17)
tsne_repr = tsne.fit_transform(X_scaled)

In [None]:
# t-SNE embedding coloured by the Churn column: False -> green, True -> red
plt.scatter(
    x=tsne_repr[:, 0],
    y=tsne_repr[:, 1],
    c=df['Churn'].map({False: 'green', True: 'red'}),
);

In [None]:
# t-SNE embedding drawn once per plan column: 'Yes' -> green, 'No' -> red
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5), sharey=True)
for i, name in enumerate(('International plan', 'Voice mail plan')):
    point_colors = df[name].map({'Yes': 'green', 'No': 'red'})
    axes[i].scatter(tsne_repr[:, 0], tsne_repr[:, 1], c=point_colors)
    axes[i].set_title(name)