In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('D:/Data Sets/penguins_size.csv')
dfc = df.copy()
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


Check all the categorical data

In [5]:
for col in df.select_dtypes(include='object'):
    print(f'Name of column {col}')
    print(df[col].value_counts())

Name of column species
Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64
Name of column island
Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64
Name of column sex
MALE      168
FEMALE    165
.           1
Name: sex, dtype: int64


Find where the . is and replace it with NaN

In [3]:
df[df['sex'] == '.']

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [4]:

df['sex'][df['sex'] == '.'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'][df['sex'] == '.'] = np.nan


In [6]:
print(df.loc[336])

species              Gentoo
island               Biscoe
culmen_length_mm       44.5
culmen_depth_mm        15.7
flipper_length_mm     217.0
body_mass_g          4875.0
sex                     NaN
Name: 336, dtype: object


Finding and filling missing values appropriately

In [7]:
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Find observations with Null values

In [8]:
df[df.isnull().any(axis=1)]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,
47,Adelie,Dream,37.5,18.9,179.0,2975.0,
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,


In [16]:
df[df['culmen_depth_mm'].isnull()]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
339,Gentoo,Biscoe,,,,,


In [10]:
dfc.loc[[3]] = dfc.loc[[3]].fillna(dfc[dfc['species'] == 'Adelie'].mean())
dfc

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.100000,18.700000,181.000000,3750.000000,MALE
1,Adelie,Torgersen,39.500000,17.400000,186.000000,3800.000000,FEMALE
2,Adelie,Torgersen,40.300000,18.000000,195.000000,3250.000000,FEMALE
3,Adelie,Torgersen,38.791391,18.346358,189.953642,3700.662252,
4,Adelie,Torgersen,36.700000,19.300000,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.800000,14.300000,215.000000,4850.000000,FEMALE
341,Gentoo,Biscoe,50.400000,15.700000,222.000000,5750.000000,MALE
342,Gentoo,Biscoe,45.200000,14.800000,212.000000,5200.000000,FEMALE


In [11]:
#{k: i for i, k in enumerate(dummy['A'].dropna().unique(), 0)}
df.sex

0        MALE
1      FEMALE
2      FEMALE
3         NaN
4      FEMALE
        ...  
339       NaN
340    FEMALE
341      MALE
342    FEMALE
343      MALE
Name: sex, Length: 344, dtype: object

In [6]:
#importing libraries
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler
encode = LabelEncoder()
impute = KNNImputer()
sca = MinMaxScaler()
for i in df.select_dtypes(include='object').columns:
    df[i][df[i].notnull()] = encode.fit_transform(df[i])


df_sca = pd.DataFrame(sca.fit_transform(df),columns=dfc.columns)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][df[i].notnull()] = encode.fit_transform(df[i])


In [7]:
df_sca.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0.0,1.0,0.254545,0.666667,0.152542,0.291667,1.0
1,0.0,1.0,0.269091,0.511905,0.237288,0.305556,0.0
2,0.0,1.0,0.298182,0.583333,0.389831,0.152778,0.0
3,0.0,1.0,,,,,
4,0.0,1.0,0.167273,0.738095,0.355932,0.208333,0.0


In [8]:
#dfc[['sex','species','island']][dfc.notnull().all(axis=1)].iloc[:,:] =l
new_df = pd.DataFrame(impute.fit_transform(df_sca), columns=df.columns)
new_df.isnull().sum()


species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [9]:
pd.DataFrame(sca.inverse_transform(new_df),columns=df.columns).round()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0.0,2.0,39.0,19.0,181.0,3750.0,1.0
1,0.0,2.0,40.0,17.0,186.0,3800.0,0.0
2,0.0,2.0,40.0,18.0,195.0,3250.0,0.0
3,0.0,2.0,39.0,18.0,191.0,3740.0,1.0
4,0.0,2.0,37.0,19.0,193.0,3450.0,0.0
...,...,...,...,...,...,...,...
339,2.0,0.0,47.0,15.0,218.0,4970.0,1.0
340,2.0,0.0,47.0,14.0,215.0,4850.0,0.0
341,2.0,0.0,50.0,16.0,222.0,5750.0,1.0
342,2.0,0.0,45.0,15.0,212.0,5200.0,0.0


In [10]:
dt = pd.DataFrame(sca.inverse_transform(new_df),columns=df.columns)
dt['sex'] = dt['sex'].round()
dt['sex'] = dt['sex'].map({0:'Female', 1:'Male'})
dt['species'] =  dt['species'].map({0:'Adeile',1:'Chinstrap',2:'Gentoo'})
dt['island'] =  dt['island'].map({0:'Biscoe',1:'Dream',2:'Torgersen'})

In [11]:
dt

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adeile,Torgersen,39.10,18.70,181.0,3750.0,Male
1,Adeile,Torgersen,39.50,17.40,186.0,3800.0,Female
2,Adeile,Torgersen,40.30,18.00,195.0,3250.0,Female
3,Adeile,Torgersen,39.32,17.54,191.4,3740.0,Male
4,Adeile,Torgersen,36.70,19.30,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,47.20,14.90,217.6,4970.0,Male
340,Gentoo,Biscoe,46.80,14.30,215.0,4850.0,Female
341,Gentoo,Biscoe,50.40,15.70,222.0,5750.0,Male
342,Gentoo,Biscoe,45.20,14.80,212.0,5200.0,Female


Now we will impute appropriate missing values

In [None]:
#index 3 and 339 filled with mean of respective classes

adeile = df[df['species'] == 'Adelie'].mean()
gentoo = df[df.species == 'Gentoo'].mean()

df.loc[[3]] = df.loc[[3]].fillna(adeile)
df.loc[[339]] = df.loc[[339]].fillna(gentoo)

In [None]:
#label encoding
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

dt = df.copy() 
   
df['sex'][df.sex.notnull()] = encode.fit_transform(df.sex.dropna())
df['species'] = encode.fit_transform(df.species)
df['island'] = encode.fit_transform(df.island)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'][df.sex.notnull()] = encode.fit_transform(df.sex.dropna())


In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()
df = imputer.fit_transform(df)

In [None]:
dt = pd.DataFrame(df, columns = dfc.columns)
dt.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0.0,2.0,39.1,18.7,181.0,3750.0,1.0
1,0.0,2.0,39.5,17.4,186.0,3800.0,0.0
2,0.0,2.0,40.3,18.0,195.0,3250.0,0.0
3,0.0,2.0,38.791391,18.346358,189.953642,3700.662252,0.2
4,0.0,2.0,36.7,19.3,193.0,3450.0,0.0


In [None]:
dt['sex'][dt.sex >= 0.5] = 1
dt['sex'][dt.sex < 0.5] = 0

In [None]:
dtc = new_df.copy()
dtc['sex'] = dtc['sex'].map({0:'Female', 1:'Male'})
dtc['species'] =  dtc['species'].map({0:'Adeile',1:'Chinstrap',2:'Gentoo'})
dtc['island'] =  dtc['species'].map({0:'Biscoe',1:'Dream',2:'Torgersen'})

In [None]:
fig, ax = plt.subplots(figsize=(12,7))

plt.scatter(x = dtc['culmen_length_mm'][dtc.species == 'Adeile'], y = dtc['body_mass_g'][dtc.species == 'Adeile'], marker ='^')
plt.scatter(x = dtc['culmen_length_mm'][dtc.species == 'Gentoo'], y = dtc['body_mass_g'][dtc.species == 'Gentoo'], marker = 's')
plt.scatter(x = dtc['culmen_length_mm'][dtc.species == 'Chinstrap'], y = dtc['body_mass_g'][dtc.species == 'Chinstrap'], marker = 'o')

In [None]:
fig, ax = plt.subplots(figsize=(12,7))
sns.countplot(data = dt, x = dt.species)

In [None]:
dtc_melt = pd.melt(dtc,id_vars=['species','sex'])
dtc_melt.head()

Unnamed: 0,species,sex,variable,value
0,Adeile,Male,island,Biscoe
1,Adeile,Female,island,Biscoe
2,Adeile,Female,island,Biscoe
3,Adeile,Female,island,Biscoe
4,Adeile,Female,island,Biscoe


In [None]:
fig, ax = plt.subplots(figsize=(12,7))

#sns.countplot(data = dtc, x = dtc['species'][dtc.sex == 'Male'], ax=ax)
#sns.countplot(data = dtc, x = dtc['species'][dtc.sex == 'Female'], ax=ax)

sns.countplot(data = dtc, x = dtc['species'], hue='sex')#palette = {'Adeile':'#FFA500','Chinstrap':'#FF00FF',"Gentoo":'#00FFFF'} )

In [None]:
female_dtc = dtc[dtc['sex'] == 'Female'][['sex','species']].groupby('species')['sex'].count().reset_index().rename(columns={'sex':'Female'})
male_dtc =  dtc[dtc['sex'] == 'Male'][['sex','species']].groupby('species')['sex'].count().reset_index().rename(columns={'sex':'Male'})
sex_dtc = pd.merge(left = female_dtc,right = male_dtc)

In [None]:
plt.bar(height = sex_dtc.Male, x = dt.species.unique(),width=0.2, color='orchid', label= 'male' )

plt.bar(height = sex_dtc.Female, x = dt.species.unique()+.2,width =0.2, color = 'teal', label = 'female' )
plt.xticks([i+0.1 for i in range(len(dt.species.unique()))], ['Adeile','Chinstrap', 'Gentoo'])
plt.legend()

In [None]:
dtc.species.unique()

array(['Adeile', 'Chinstrap', 'Gentoo'], dtype=object)

In [None]:
colors = {'Adeile':'#FFA500','Chinstrap':'#FF00FF',"Gentoo":'#00FFFF'}


In [None]:
fig, ax = plt.subplots(figsize=(16,9))
sns.violinplot(data=dtc, y=dtc.flipper_length_mm, hue = dtc.species, x = dtc.sex ,palette = ['#FFA500','#FF00FF','#00FFFF'], inner='point')

plt.show()
fig, ax = plt.subplots(figsize=(16,9))
sns.boxplot(data=dtc, y=dtc.flipper_length_mm, x = dtc.species, palette = ['#FFA500','#FF00FF','#00FFFF','#FFA500'], )

plt.show()

In [48]:
from plotly.subplots import make_subplots
trace_bm = []
color =['darkorange','mediumorchid','teal']
for var,col in zip(dt.species.unique(),color):

    trace = go.Violin(x = dt['species'][dt['species']==var], y =dt['body_mass_g'][dt['species']==var],
                    box_visible=True,
                    meanline_visible=True,
                    points='all',
                    line_color=col,
                    name=var)
    trace_bm.append(trace)
trace_flipper = []
for var,col in zip(dt.species.unique(),color):
    trace2 = go.Violin(x = dt['species'][dt['species']==var], y =dt['flipper_length_mm'][dt['species']==var],
             box_visible=True,
             meanline_visible=True,
             points='all',
             line_color=col,
        )
    trace_flipper.append(trace2)
fig = make_subplots(rows=2, cols=1, subplot_titles=("Body Mass (g)","Flipper Length (mm)")) 
for i in trace_bm:
    fig.add_trace(i,row=1,col=1)
for j in trace_flipper:    
    fig.add_trace(j,row=2,col=1)
fig.update_layout(showlegend = False, title = 'Violin Plots',height=800)
fig.show()


In [None]:
fig = px.imshow(dt.corr(), width=1000, text_auto = True, color_continuous_scale = 'PuBu', title='Heatmap' )
fig.show()

correlation plot flipper length vs body mass index 

In [70]:
fig = px.scatter(data_frame=dt, 
           x = 'flipper_length_mm', y = 'body_mass_g',
           color = 'species', 
           color_discrete_map={'Adeile':'rgb(251,117,4)', 'Chinstrap':'rgb(167,98,188)', 'Gentoo':'rgb(4,115,116)'},
           symbol='species',
           symbol_map = {'Adeile':'circle', 'Chinstrap':'triangle-up', 'Gentoo':'square'},
           
           )
fig.update_traces(marker=dict(size=9))
fig.update_layout(title='flipper length (mm) vs body mass (g)',
                  titlefont = dict(color='black', family='Open Sans',), 
                   )
fig.show()

In [68]:
fig = px.scatter(data_frame=dt, 
           x = 'culmen_length_mm', y = 'culmen_depth_mm',
           color = 'species', 
           color_discrete_map={'Adeile':'rgb(251,117,4)', 'Chinstrap':'rgb(167,98,188)', 'Gentoo':'rgb(4,115,116)'},
           symbol='species',
           symbol_map = {'Adeile':'circle', 'Chinstrap':'triangle-up', 'Gentoo':'square'},
           
           )
fig.update_traces(marker=dict(size=9))
fig.update_layout(title='culmen length (mm) vs culmen depth (mm)',
                  titlefont = dict(color='black', family='Open Sans',), 
                   )
fig.show()
#{'Adeile':'square', 'Chinstrap':'triangule-up', 'Gentoo':'circle'}


In [None]:
#fig = go.Figure()
#fig.add_trace(go.Scatter(x=dt['flipper_length_mm'][dt.species == 'Gentoo'], y = dt['body_mass_g'][dt.species == 'Gentoo'], 
#                        mode = 'markers', marker=dict(
#                        size=11, symbol='triangle-up', line=dict(color='rgb(0,0,0)', width=0.5)
#                        ) ,name='Gentoo', marker_color = 'teal'))
#
#fig.add_trace(go.Scatter(x=dt['flipper_length_mm'][dt.species == 'Adeile'], y = dt['body_mass_g'][dt.species == 'Adeile'], 
#                         mode = 'markers', name='Adeile', marker_color='orange'))
#
#fig.add_trace(go.Scatter(x=dt['flipper_length_mm'][dt.species == 'Chinstrap'], y = dt['body_mass_g'][dt.species == 'Chinstrap'], 
#                         mode = 'markers', name='Chinstrap', marker_color='rgb(167,98,188)'))
#fig.show()

In [None]:
#fig1 = go.Figure()
#fig1.add_trace(go.Bar(x = dt['species'].value_counts().index , 
#                      y =  dt['species'][dt.sex == 'Male'].value_counts().values, 
#                      marker=dict(color = ['orange','teal','orchid'] ),
#                      text='Male', textposition='outside' ))
#fig1.add_trace(go.Bar(x = dt['species'].value_counts().index, 
#                      y =  dt['species'][dt.sex == 'Female'].value_counts().values, 
#                      marker=dict(color = ['orange','teal','orchid'] ), 
#                      text='Female', textposition='outside')) 
#
#fig1.update_layout(showlegend = False,height=800,title='Species based Gender count plot', 
#                   titlefont = dict(size =36, color='black', family='Open Sans',), 
#                   )
#fig1.show()

In [None]:
dt.groupby(['island','species'])['island'].count()

island     species  
Biscoe     Adeile        44
           Gentoo       124
Dream      Adeile        56
           Chinstrap     68
Torgersen  Adeile        52
Name: island, dtype: int64

In [61]:
fig =px.bar( data_frame=dt, y = 'sex', 
        facet_row='species',facet_row_spacing=0.10, 
        pattern_shape='species',
        color='species', 
        color_discrete_map={'Adeile':'rgb(251,117,4)', 'Chinstrap':'rgb(167,98,188)', 'Gentoo':'rgb(4,115,116)'},
         )
fig.show()      

In [55]:
fig = px.scatter(data_frame=dt, x='flipper_length_mm' , y = 'body_mass_g', 
                 facet_col='species', color='sex',
                 color_discrete_map={'Male':'darkblue','Female':'deeppink'}
                 )
fig.update_layout(showlegend = False,height=400,title='Species based Gender scatter plot', 
                  )
fig.show()

In [152]:
cols = ['culmen_length_mm','culmen_depth_mm','flipper_length_mm','body_mass_g']
x = dt.loc[:,cols].values
y = dt.loc[:,['species']].values
from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(x)

In [154]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca_x = pd.DataFrame(pca.fit_transform(x),columns=['PC1','PC2','pc3','pc4'])
pca_final = pd.concat([pca_x,dt.species],axis=1)

In [155]:
pca_final

Unnamed: 0,PC1,PC2,pc3,pc4,species
0,-1.847051,0.049152,-0.232528,0.523879,Adeile
1,-1.309710,-0.426976,-0.028889,0.402564,Adeile
2,-1.372697,-0.152852,0.199931,-0.528375,Adeile
3,-1.172902,-0.395541,-0.124406,0.040760,Adeile
4,-1.883293,-0.001219,-0.618227,-0.479019,Adeile
...,...,...,...,...,...
339,1.940777,-0.465015,0.017077,-0.095161,Gentoo
340,1.840565,-0.764811,0.229880,0.003629,Gentoo
341,2.760549,0.292970,-0.429640,0.241494,Gentoo
342,1.721811,-0.701715,-0.277225,0.339282,Gentoo


In [167]:
fig = px.scatter(data_frame=pca_final, 
           x = 'PC1', y = 'PC2',
           color = 'species', 
           color_discrete_map={'Adeile':'rgb(251,117,4)', 'Chinstrap':'rgb(167,98,188)', 'Gentoo':'rgb(4,115,116)'},
           symbol='species',
           symbol_map = {'Adeile':'circle', 'Chinstrap':'triangle-up', 'Gentoo':'square'},         
           )
fig.update_traces(marker=dict(size=9))
fig.update_layout(title='Principal component 1 vs Principal component 2',
                  titlefont = dict(color='black', family='Open Sans',), 
                   )
fig.show()

In [168]:
import chart_studio.plotly as py
py.plot(fig, filename = 'Principal component 1 vs Principal component 2(penguin data)', auto_open=True)

'https://plotly.com/~sunilkumardash9/19/'

In [158]:
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2','PC3','PC4'], 
                        index=dt.select_dtypes(include='float64').columns)
loadings

Unnamed: 0,PC1,PC2,pc3,pc4
culmen_length_mm,0.455295,0.597712,0.643631,0.145554
culmen_depth_mm,-0.400915,0.797264,-0.418744,-0.168199
flipper_length_mm,0.575758,0.002241,-0.232069,-0.783991
body_mass_g,0.548156,0.0843,-0.597107,0.579553


In [159]:
pca.explained_variance_ratio_

array([0.689302  , 0.19270172, 0.0909941 , 0.02700217])

In [170]:
fig = px.bar(y =loadings.columns,x=pca.explained_variance_ratio_ * 100, 
       color=loadings.columns, orientation='h')
fig.show()

In [178]:
py.plot(fig, filename = 'PCA loadings by variables(penguin data)', auto_open=True)

'https://plotly.com/~sunilkumardash9/26/'

In [177]:
from plotly.subplots import make_subplots
fig = make_subplots(rows=2,cols=2, horizontal_spacing=0.1,
                    subplot_titles=['PC1','PC2','PC3','PC4'])
fig.add_trace(go.Bar(x=loadings['PC1'],
              y=loadings.index,name='PC1',
              orientation='h'), row=1,col=1)
fig.add_trace(go.Bar(x=loadings['PC2'],
              y=loadings.index,name='PC2',
              orientation='h'), row=1,col=2)
fig.add_trace(go.Bar(x=loadings['pc3'],
              y=loadings.index,name='PC3',
              orientation='h'), row=2,col=1)
fig.add_trace(go.Bar(x=loadings['pc4'],
              y=loadings.index,name='PC4',
              orientation='h'), row=2,col=2)
fig.show()

In [150]:
pca.explained_variance_ratio_

array([0.689302  , 0.19270172, 0.0909941 , 0.02700217])

In [151]:
loadings

Unnamed: 0,PC1,PC2,pc3,pc4
culmen_length_mm,0.455295,0.597712,0.643631,0.145554
culmen_depth_mm,-0.400915,0.797264,-0.418744,-0.168199
flipper_length_mm,0.575758,0.002241,-0.232069,-0.783991
body_mass_g,0.548156,0.0843,-0.597107,0.579553
