In [None]:
# Creation of dataframes with Pandas 

In [1]:
import sys

import numpy as np
import pandas as pd

In [2]:
df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df

Unnamed: 0,First,Second,Third
a,1.699077,0.725544,-0.368596
b,1.105148,0.501836,0.524282


In [3]:
df.index # "row names"

Index(['a', 'b'], dtype='object')

In [4]:
df.columns  # "column names"

Index(['First', 'Second', 'Third'], dtype='object')

In [5]:
df2=pd.DataFrame(np.random.randn(2,3), index=["a", "b"])
df2

Unnamed: 0,0,1,2
a,-1.650977,-2.159514,0.167042
b,1.069168,-1.32036,1.879226


In [6]:
df2.columns

RangeIndex(start=0, stop=3, step=1)

In [7]:
s1 = pd.Series([1, 2, 3])
s1

0    1
1    2
2    3
dtype: int64

In [8]:
# Giving a name for the Series object with name attribute

s2 = pd.Series([4,5,6], name="b")
s2

0    4
1    5
2    6
Name: b, dtype: int64

In [9]:
# Creating a DataFrame from the Series s1 and naming its single column
# with the columns parameter
pd.DataFrame(s1, columns=["a"])

Unnamed: 0,a
0,1
1,2
2,3


In [10]:
# name attribute of Series s2 as the column name
pd.DataFrame(s2)

Unnamed: 0,b
0,4
1,5
2,6


In [11]:
# Multiple columns
pd.DataFrame({"a": s1, "b": s2})

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [12]:
# First way to create DataFrames from rows

df=pd.DataFrame([{"Wage" : 1000, "Name" : "Jack", "Age" : 21}, 
                 {"Wage" : 1500, "Name" : "John", "Age" : 29}])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [13]:
# Second way. Columns parameter to specify the exact order in the dataframe

df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [25]:
# Each key is a row label (the city); each value holds that row's cells in
# the order given by `columns`.
city_rows = {
    "Helsinki": [643272, 715.48],
    "Espoo": [279044, 528.03],
    "Tampere": [231853, 689.59],
    "Vantaa": [223027, 240.35],
    "Oulu": [201810, 3817.52],
}
df = pd.DataFrame.from_dict(
    city_rows,
    orient="index",
    columns=["Population", "Total area"],
)
df

Unnamed: 0,Population,Total area
Helsinki,643272,715.48
Espoo,279044,528.03
Tampere,231853,689.59
Vantaa,223027,240.35
Oulu,201810,3817.52


In [4]:
def main(path="data.txt"):
    """Print the shape and the column names of a tab-separated data file.

    Parameters
    ----------
    path : str or path-like, default "data.txt"
        File handed to ``pd.read_csv`` with ``sep="\\t"``. The default keeps
        the original no-argument call ``main()`` working unchanged.

    Prints
    ------
    ``Shape: <rows>, <columns>``, then ``Columns:`` followed by one column
    name per line, and finally the shape again as ``<rows> <columns>``.
    """
    wh = pd.read_csv(path, sep="\t")

    # df.shape is a (rows, columns) tuple, so both counts come from one
    # attribute instead of measuring the index and the columns separately.
    n_rows, n_cols = wh.shape

    print(f"Shape: {n_rows}, {n_cols}")
    print("Columns:")
    for col in wh.columns:
        print(col)

    # Unpacking the tuple prints the same two numbers separated by a space.
    print(*wh.shape)

In [5]:
main()

Shape: 490, 7
Columns:
Region 2018
Population
Population change from the previous year, %
Share of Swedish-speakers of the population, %
Share of foreign citizens of the population, %
Proportion of the unemployed among the labour force, %
Proportion of pensioners of the population, %
490 7


In [6]:
# Accessing columns and rows

# Let's first create new dataframe

df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df

Unnamed: 0,First,Second,Third
a,-1.318267,0.062271,-1.54905
b,-0.008841,-0.747421,0.238967


In [7]:
# The following indexing does not work: plain brackets look up column labels,
# and the column labels of this DataFrame are strings, so 0 is not a valid key.

try:
    df[0]
except KeyError:
    # Report on stderr so the message is shown as an error stream in the notebook.
    print("Key error", file=sys.stderr)

Key error


In [11]:
# This will however work.

df[["First"]]

Unnamed: 0,First
a,-1.318267
b,-0.008841


In [12]:
# fancy indexing

df[["First", "Second"]]

Unnamed: 0,First,Second
a,-1.318267,0.062271
b,-0.008841,-0.747421


In [13]:
df[0:1] # slice a row

Unnamed: 0,First,Second,Third
a,-1.318267,0.062271,-1.54905


In [14]:
df[df.Third > 0] # boolean mask 

Unnamed: 0,First,Second,Third
b,-0.008841,-0.747421,0.238967


In [15]:
"""If some of the above calls return a Series object, then you can chain the 
bracket calls to get a single value from the DataFrame"""

# Use the row label in the second bracket: an integer key on a Series indexed
# by "a"/"b" relies on a positional fallback that recent pandas versions
# deprecate. For a purely positional lookup use df["Second"].iloc[1] instead.
df["Second"]["b"]

-0.7474208694739515

In [31]:
# Alternative indexing and data selection with loc and iloc: loc uses the
# explicit indices (the row and column labels), whereas iloc uses the
# implicit integer indices (positions).

df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df


Unnamed: 0,First,Second,Third
a,2.60399,-0.452106,-1.048698
b,0.296766,-2.939868,-0.440902


In [32]:
df.loc["b", "First"] # explicit indices

0.2967661905189625

In [33]:
df.iloc[-1,-1]  # implicit integer indices

-0.4409022749522882

In [35]:
df.loc["a", ["First", "Third"]]

First    2.603990
Third   -1.048698
Name: a, dtype: float64