### Pairplots in Python

### Gapminder Socioeconomic Data

   - We will be using GapMinder socioeconomic data that is available in the R package gapminder. The data has been saved to 
      
     a csv file which we will read into a dataframe. There are six columns in the data:
     

              1. Country
        
              2. Continent: useful for grouping data
        
              3. Year: data coveres 1952-2007
                        
              4. life_exp: the life expectancy at birth
                
              5. pop: population
                
              6. gdp_per_cap: the per capita (per person) GDP in international dollars

In [32]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

In [5]:
# matplotlib for plotting
import matplotlib.pyplot as plt
import matplotlib

# Set text size
matplotlib.rcParams['font.size'] = 18

# Seaborn for pairplots
import seaborn as sns

sns.set_context('talk', font_scale=1.2);

In [33]:
url = 'C:/Users/deepusuresh/Documents/Data Science/3. Pair Plot'

df = pd.read_csv('gapminder.csv')
df.head()

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
0,Afghanistan,,0.03,0.5696534,26.8,75944000.0,25.60000038,,3.654121623,48.673,,0.0,,6.6843853,55.70000076,24.04
1,Albania,1914.996551,7.29,1.0247361,57.4,223747333.3,42.09999847,,44.98994696,76.918,,9.0,636.3413834,7.699329853,51.40000153,46.72
2,Algeria,2231.993335,0.69,2.306817,23.5,2932108667.0,31.70000076,0.1,12.50007331,73.131,0.420094525,2.0,590.5098143,4.848769665,50.5,65.22
3,Andorra,21943.3399,10.17,,,,,,81.0,,,,,5.362178802,,88.92
4,Angola,1381.004268,5.57,1.4613288,23.1,248358000.0,69.40000153,2.0,9.999953883,51.093,,-2.0,172.9992274,14.55467701,75.69999695,56.7


In [3]:
df.columns

Index(['country', 'incomeperperson', 'alcconsumption', 'armedforcesrate',
       'breastcancerper100th', 'co2emissions', 'femaleemployrate', 'hivrate',
       'internetuserate', 'lifeexpectancy', 'oilperperson', 'polityscore',
       'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate'],
      dtype='object')

In [4]:
df.shape

(213, 16)

In [34]:
df.isnull().head()

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
df.isnull().sum()

country                 0
incomeperperson         0
alcconsumption          0
armedforcesrate         0
breastcancerper100th    0
co2emissions            0
femaleemployrate        0
hivrate                 0
internetuserate         0
lifeexpectancy          0
oilperperson            0
polityscore             0
relectricperperson      0
suicideper100th         0
employrate              0
urbanrate               0
dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 16 columns):
country                 213 non-null object
incomeperperson         213 non-null object
alcconsumption          213 non-null object
armedforcesrate         213 non-null object
breastcancerper100th    213 non-null object
co2emissions            213 non-null object
femaleemployrate        213 non-null object
hivrate                 213 non-null object
internetuserate         213 non-null object
lifeexpectancy          213 non-null object
oilperperson            213 non-null object
polityscore             213 non-null object
relectricperperson      213 non-null object
suicideper100th         213 non-null object
employrate              213 non-null object
urbanrate               213 non-null object
dtypes: object(16)
memory usage: 26.7+ KB


In [39]:
df.replace(r'\s+|^$', np.nan, regex=True)

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
0,Afghanistan,,0.03,0.5696534,26.8,75944000,25.60000038,,3.654121623,48.673,,0,,6.6843853,55.70000076,24.04
1,Albania,1914.996551,7.29,1.0247361,57.4,223747333.3,42.09999847,,44.98994696,76.918,,9,636.3413834,7.699329853,51.40000153,46.72
2,Algeria,2231.993335,0.69,2.306817,23.5,2932108667,31.70000076,0.1,12.50007331,73.131,0.420094525,2,590.5098143,4.848769665,50.5,65.22
3,Andorra,21943.3399,10.17,,,,,,81,,,,,5.362178802,,88.92
4,Angola,1381.004268,5.57,1.4613288,23.1,248358000,69.40000153,2,9.999953883,51.093,,-2,172.9992274,14.55467701,75.69999695,56.7
5,,11894.46407,8.17,,,16225000,,,80.64545455,,,,,2.1618433,,30.46
6,Argentina,10749.41924,9.35,0.560987,73.9,5872119000,45.90000153,0.5,36.00033495,75.901,0.635943801,8,768.4282997,7.765584,58.40000153,92
7,Armenia,1326.741757,13.66,2.6184384,51.6,51219666.67,34.20000076,0.1,44.00102458,74.241,,5,603.7630576,3.741587877,40.09999847,63.86
8,Aruba,,,,,35871000,,,41.80088889,75.246,,,,,,46.78
9,Australia,25249.98606,10.21,0.4862799,83.2,12970092667,54.59999847,0.1,75.8956538,81.907,1.913026109,10,2825.391095,8.470030125,61.5,88.74


### filling blank cells

In [40]:
df.describe()

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
count,213,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
unique,213,191.0,181.0,165.0,137.0,201.0,154.0,47.0,193.0,190.0,64.0,22.0,133.0,192.0,140.0,195.0
top,Poland,,,,,,,,,,,,,,,
freq,1,23.0,26.0,49.0,40.0,13.0,35.0,66.0,21.0,22.0,150.0,52.0,77.0,22.0,35.0,10.0


### Default Pair Plot with All Data

In [None]:
sns.pairplot(df);

In [None]:
df['log_pop'] = np.log10(df['pop'])
df['log_gdp_per_cap'] = np.log10(df['gdp_per_cap'])

df = df.drop(columns = ['pop', 'gdp_per_cap'])

### Group and Color by a Variable

In [None]:
matplotlib.rcParams['font.size'] = 40
sns.pairplot(df, hue = 'continent');

### Customizing pairplot

In [None]:
sns.pairplot(df, hue = 'continent', diag_kind = 'kde', plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, size = 4);

In [None]:
df['decade'] = pd.cut(df['year'], bins = range(1950, 2010, 10))
df.head()

In [None]:
sns.pairplot(df, hue = 'decade', diag_kind = 'kde', vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'],
             plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, size = 4);

In [None]:
sns.pairplot(df[df['year'] >= 2000], vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'], 
             hue = 'continent', diag_kind = 'kde', plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, size = 4);
plt.suptitle('Pair Plot of Socioeconomic Data for 2000-2007', size = 28);

### More Customization with sns.PairGrid

In [None]:
# Create an instance of the PairGrid class.
grid = sns.PairGrid(data= df[df['year'] == 2007],
                    vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'], size = 4)

# Map different plots to different sections
grid = grid.map_upper(plt.scatter, color = 'darkred')
grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
grid = grid.map_diag(plt.hist, bins = 10, color = 'darkred', edgecolor = 'k');

In [None]:
# Function to calculate correlation coefficient between two arrays
def corr(x, y, **kwargs):
    
    # Calculate the value
    coef = np.corrcoef(x, y)[0][1]
    # Make the label
    label = r'$\rho$ = ' + str(round(coef, 2))
    
    # Add the label to the plot
    ax = plt.gca()
    ax.annotate(label, xy = (0.2, 0.95), size = 20, xycoords = ax.transAxes)
    
# Create a pair grid instance
grid = sns.PairGrid(data= df[df['year'] == 2007],
                    vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'], size = 4)

# Map the plots to the locations
grid = grid.map_upper(plt.scatter, color = 'darkred')
grid = grid.map_upper(corr)
grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
grid = grid.map_diag(plt.hist, bins = 10, edgecolor =  'k', color = 'darkred');

In [None]:
# Define a summary function
def summary(x, **kwargs):
    # Convert to a pandas series
    x = pd.Series(x)
    
    # Get stats for the series
    label = x.describe()[['mean', 'std', 'min', '50%', 'max']]
    
    # Convert from log to regular scale
    # Adjust the column names for presentation
    if label.name == 'log_pop':
        label = 10 ** label
        label.name = 'pop stats'
    elif label.name == 'log_gdp_per_cap':
        label = 10 ** label
        label.name = 'gdp_per_cap stats'
    else:
        label.name = 'life_exp stats'
       
    # Round the labels for presentation
    label = label.round()
    ax = plt.gca()
    ax.set_axis_off()

    # Add the labels to the plot
    ax.annotate(pd.DataFrame(label),
               xy = (0.1, 0.2), size = 20, xycoords = ax.transAxes)
    
# Create a pair grid instance
grid = sns.PairGrid(data= df[df['year'] == 2007],
                    vars = ['life_exp', 'log_pop', 'log_gdp_per_cap'], size = 4)

# Fill in the mappings
grid = grid.map_upper(plt.scatter, color = 'darkred')
grid = grid.map_upper(corr)
grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
grid = grid.map_diag(summary);