# Making a dataframe

In [47]:
import pandas as pd

* Use a dictionary (column name → list of values) to create a DataFrame

In [48]:
# Each dictionary key becomes a column label; the list under it holds that
# column's values from the first row to the last.
df = pd.DataFrame(
    {
        'a': [1, 2, 3, 4],
        'b': [5, 6, 7, 8],
        'c': [9, 10, 11, 12],
    }
)

In [49]:
df

Unnamed: 0,a,b,c
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


* Use a list of lists (one inner list per row) to create a DataFrame

In [50]:
# Nested list laid out as a table: every inner list is one row.
a = [
    [1, 4, 7],
    [2, 5, 8],
    [3, 6, 9],
]

In [51]:
df2 = pd.DataFrame(a)

In [52]:
df2

Unnamed: 0,0,1,2
0,1,4,7
1,2,5,8
2,3,6,9


In [53]:
# Replace the default integer column labels (0, 1, 2) with names.
df2.columns = ['a', 'b', 'c']

* Change column names by assigning a new list to `.columns`

In [54]:
# Assigning to .columns relabels df in place: columns a, b, c become d, e, f.
df.columns = ['d', 'e', 'f']

In [55]:
df

Unnamed: 0,d,e,f
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


* Copy a DataFrame

In [56]:
import copy

In [57]:
# DataFrame.copy() is deep by default, so df3 gets its own data and index:
# later changes to df3 are not reflected in df, and vice versa.
df3 = df.copy()

In [58]:
df3

Unnamed: 0,d,e,f
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


* extracting columns

# Extracting columns

* Create a Series in pandas

In [59]:
# A Series built from a plain list gets the default integer index 0..n-1.
a = pd.Series(data=[1, 2, 3] * 2)

In [60]:
a

0    1
1    2
2    3
3    1
4    2
5    3
dtype: int64

* Set a custom index with the `index` argument

In [61]:
# Same values as before, but labelled 'a'..'f' instead of 0..5.
a = pd.Series(
    data=[1, 2, 3] * 2,
    index=list('abcdef'),
)

In [62]:
a

a    1
b    2
c    3
d    1
e    2
f    3
dtype: int64

# Extracting data with conditions

In [63]:
# Three integer columns with five rows each. list(range(...)) produces the
# same values as the identity comprehension [i for i in range(...)] without
# the redundant loop.
df = pd.DataFrame(
    {
        'a': list(range(1, 6)),
        'b': list(range(6, 11)),
        'c': list(range(11, 16)),
    }
)

In [64]:
df

Unnamed: 0,a,b,c
0,1,6,11
1,2,7,12
2,3,8,13
3,4,9,14
4,5,10,15


In [65]:
df[['a', 'c']]

Unnamed: 0,a,c
0,1,11
1,2,12
2,3,13
3,4,14
4,5,15


In [66]:
df[df['a'] >= 3]

Unnamed: 0,a,b,c
2,3,8,13
3,4,9,14
4,5,10,15


In [67]:
# Select rows and columns in one .loc lookup (row mask first, column list
# second) instead of chaining two [] operations, which builds an
# intermediate frame before picking the columns.
df.loc[df['a'] >= 3, ['a', 'b']]

Unnamed: 0,a,b
2,3,8
3,4,9
4,5,10


In [68]:
# Combine row conditions with `&` (element-wise AND). Each comparison is
# parenthesised because `&` binds more tightly than `>=` and `<`.
df[(df['a'] >= 3) & (df['b'] < 16)]

Unnamed: 0,a,b,c
2,3,8,13
3,4,9,14
4,5,10,15


In [69]:
TF = (df['a'] >= 3) & (df['b'] < 16)

In [70]:
TF

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [71]:
df[TF]

Unnamed: 0,a,b,c
2,3,8,13
3,4,9,14
4,5,10,15


In [72]:
# df.index[:4] picks the first four row labels and drop() removes those rows.
# The result is only displayed here, not assigned back to df.
df.drop(df.index[:4])

Unnamed: 0,a,b,c
4,5,10,15


# Working with a big dataset

In [73]:
import pandas as pd  # repeated here; pandas is already imported in the first code cell
# 'train.csv' is a relative path, so it is looked up in the kernel's working directory.
titanic_df = pd.read_csv('train.csv')

In [74]:
type(titanic_df)

pandas.core.frame.DataFrame

In [75]:
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [76]:
# Boolean mask, one entry per passenger: True for female survivors aged 10
# or younger. Rows with a missing Age compare False and are excluded.
live = (
    titanic_df['Survived'].eq(1)
    & titanic_df['Sex'].eq('female')
    & titanic_df['Age'].le(10.0)
)

In [77]:
live

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [78]:
titanic_df[live]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C
58,59,1,2,"West, Miss. Constance Mirium",female,5.0,1,2,C.A. 34651,27.75,,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
184,185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4.0,0,2,315153,22.025,,S
233,234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S
237,238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8.0,0,2,C.A. 31921,26.25,,S
381,382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1.0,0,2,2653,15.7417,,C
448,449,1,3,"Baclini, Miss. Marie Catherine",female,5.0,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C


In [93]:
len(titanic_df[live])

19

In [94]:
# Membership masks for first and third class.
pclass1 = titanic_df['Pclass'] == 1
pclass3 = titanic_df['Pclass'] == 3

# Survivors within each class: reuse the class masks rather than repeating
# the Pclass comparison.
pclass1_survivors = pclass1 & (titanic_df['Survived'] == 1)
pclass3_survivors = pclass3 & (titanic_df['Survived'] == 1)

In [96]:
# Report the number of rows selected by each mask, survivors first and then
# class totals, in the same order as before.
for label, mask in (
    ('Passenger class 1 survivors:', pclass1_survivors),
    ('Passenger class 3 survivors:', pclass3_survivors),
    ('Passenger class 1 total members:', pclass1),
    ('Passenger class 3 total members:', pclass3),
):
    print(label, len(titanic_df[mask]))

Passenger class 1 survivors: 136
Passenger class 3 survivors: 119
Passenger class 1 total members: 216
Passenger class 3 total members: 491


In [98]:
# Survival rate per class: survivors in the class divided by all members of
# that class, expressed as a percentage.
for label, survivors, members in (
    ('Percentage of class 1 survivors: ', pclass1_survivors, pclass1),
    ('Percentage of class 3 survivors: ', pclass3_survivors, pclass3),
):
    print(label, len(titanic_df[survivors]) / len(titanic_df[members]) * 100, '%')

Percentage of class 1 survivors:  62.96296296296296 %
Percentage of class 3 survivors:  24.236252545824847 %
