In [1]:
import numpy as np
import pandas as pd


In [2]:
# Show which pandas version this notebook was run against.
pd.__version__

'2.2.3'

In [None]:
# A Series built from a plain list; no index is given, so pandas assigns
# the default integer labels 0..n-1.
data = pd.Series([1, 0.45, 3.6, 9])
data

In [None]:
# Report the underlying array of values and the index object of `data`.
print(f"The values are {data.values} and the indexes are: {data.index}")

In [None]:
# The same kind of Series, but with explicit string labels instead of the
# default integer index.
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data

In [None]:
# State populations keyed by state name; when a dict is passed to pd.Series
# its keys become the index labels.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
population = pd.Series(population_dict)
population

In [None]:
# Label-based slice: selects the entries from 'California' through 'Florida'
# (with label slices the end label is included).
population['California':'Florida']

In [None]:
# Dictionary-style lookup of a single value by its label.
population['Florida']

In [None]:
# Data can be a scalar, which is repeated for every label in the specified index:
pd.Series(5, index=[100, 200, 300])

In [None]:
# State areas keyed by state name, turned into a Series the same way as the
# population data.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
area = pd.Series(area_dict)
area

## Constructing DataFrame objects
**A Pandas DataFrame can be constructed in a variety of ways. Here we’ll give several
examples**

## From a dictionary of Series objects.

In [None]:
# Each dict key becomes a column name and each Series supplies that column's
# values; rows are aligned on the Series' index labels.
states=pd.DataFrame({'Population':population,'Area':area})
states

In [None]:
# Row labels, then column labels, of the frame.
print(states.index)
print(states.columns)

In [None]:
# Dictionary-style access to a single column.
states['Area']

## From a list of dicts

In [None]:
# Each dict is one row. Keys missing from a row's dict are filled with
# NaN ("not a number") in that row.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}],index=['A','B'])

## From a two-dimensional NumPy array.

In [None]:
# Wrap a 3x2 array of uniform random numbers in a DataFrame with explicit
# row and column labels. The generator is seeded so the displayed values are
# the same on every run of the notebook.
pd.DataFrame(
    np.random.default_rng(0).random((3, 2)),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)

## The Pandas Index Object (Immutable)

In [None]:
# An Index can be constructed directly from a list of integers.
ind = pd.Index([2, 3, 5, 7, 11])
ind

In [None]:
# Index exposes array-style attributes: element count, shape, number of
# dimensions, and element dtype.
print(ind.size, ind.shape, ind.ndim, ind.dtype)

# Data Selection in Series

## Series as one-dimensional array

In [None]:
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not imported again here.
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
# Assigning to a label that does not exist yet extends the Series with it.
data['e'] = 1.25
data

In [None]:
# Slicing by explicit index (labels 'a' through 'c')
print(data['a':'c'])
# Slicing by implicit index (integer positions 0 and 1)
print(data[0:2])
# Masking: keep the entries strictly between 0.3 and 0.8
print(data[(data>0.3) & (data<0.8)])
# Fancy indexing: select several labels at once with a list
print(data[['a','e']])


## Indexers: loc and iloc (the older ix indexer has been removed from pandas)

In [None]:
# With integer labels, plain [] means different things for indexing and slicing.
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data)
# Indexing with an integer uses the explicit index (the label 1)
print(data[1])
# Slicing with integers uses the implicit index (positions 1 and 2)
print(data[1:3])

**loc (Explicit Index)**

In [None]:
# The loc attribute always refers to the explicit index (labels)
print(data.loc[1])
print(data.loc[1:3])

**iloc (Implicit Index)**

In [None]:
# The iloc attribute always refers to the implicit index (integer positions)
print(data.iloc[1])
print(data.iloc[1:3])

# Data Selection in DataFrame

**DataFrame as a dictionary**

In [None]:
# Per-state area and population as two Series sharing the same labels,
# combined column-wise into a single DataFrame.
area = pd.Series({
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
})
pop = pd.Series({
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
})
data = pd.DataFrame({'area': area, 'pop': pop})
data

In [None]:
# Accessing a column via dictionary-style indexing
print(data['area'])
# The same column via attribute-style access
print(data.area)

In [None]:
# Add a derived column: population divided by area, computed row by row.
data['density']=data['pop']/data['area']
data

**DataFrame as two-dimensional array**

In [None]:
# The frame's contents as a two-dimensional NumPy array.
data.values

In [None]:
# Transpose: swap rows and columns
data.T

In [None]:
# A single integer index on the underlying array picks a row...
print(data.values[0])
# ...while a single string index on the DataFrame picks a column.
print(data['area'])

**Implicit Indexing**

In [None]:
print(data)
# Positional selection: the first three rows, columns at positions 1 and 2.
print(data.iloc[:3,1:3])

**Explicit Indexing**

In [None]:
# Label-based selection: rows up to and including 'Illinois', columns up to
# and including 'pop'.
print(data.loc[:'Illinois', :'pop'])

**loc with masking and fancy indexing**

In [None]:
# Boolean mask selects the rows; the list of labels selects the columns.
data.loc[data.density>100,['pop','density']]

In [None]:
# Modification of values: overwrite the cell at row position 0, column position 2.
# Run top to bottom, column position 2 is the 'density' column added earlier.
data.iloc[0, 2] = 90
data

In [None]:
# With plain [], a single label refers to a column, but slices and boolean
# masks refer to rows:
# - label slice of rows
print(data['Florida':'Illinois'])
# - positional slice of rows
print(data[1:3])
# - boolean mask over rows
print(data[data.density > 100])

## Methods of MultiIndex Creation

In [None]:
# Passing a list of two label lists as `index` builds a two-level MultiIndex.
# The generator is seeded so the random values are the same on every run.
df = pd.DataFrame(
    np.random.default_rng(0).random((4, 2)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

In [None]:
# A dict keyed by (state, year) tuples; pd.Series turns the tuple keys into a
# two-level MultiIndex.
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        # Was 1937810 (a digit was dropped); the 2010 figure is 19,378,102.
        ('New York', 2010): 19378102}
pd.Series(data)

In [None]:
# Build a MultiIndex explicitly from a list of (outer label, inner label) tuples.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])


In [None]:
# NOTE(review): run top to bottom, `pop` here is still the single-level Series
# of state populations built in the "DataFrame as a dictionary" cell, not a
# multi-indexed one. Confirm this is the object this section means to show.
pop['California']

## Simple Concatenation with pd.concat

In [None]:
# Two Series with disjoint integer labels, stacked end to end by pd.concat.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat([ser1, ser2])

In [None]:
# Two frames that share an 'employee' column but list the employees in a
# different order.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})
# The concatenation itself is left commented out; only the two inputs are printed.
# pd.concat([df1,df2])
print(df1)
print(df2)

## Specifying Set Arithmetic for Joins

In [None]:
# Two small frames whose only shared column is 'name'; 'Mary' is the only
# name present in both.
df1 = pd.DataFrame(
    {'name': ['Peter', 'Paul', 'Mary'], 'food': ['fish', 'beans', 'bread']},
    columns=['name', 'food'],
)
df2 = pd.DataFrame(
    {'name': ['Mary', 'Joseph'], 'drink': ['wine', 'beer']},
    columns=['name', 'drink'],
)
# Intersection of the two sets of inputs: how='inner' is pd.merge's default,
# spelled out here to contrast with the outer join.
print(pd.merge(df1, df2, how='inner'))

   name   food drink
0  Mary  bread  wine


In [4]:
# Union of the input rows: names found in only one frame are kept, and the
# columns they lack are filled with NaN.
print(pd.merge(df1, df2, how='outer'))

     name   food drink
0  Joseph    NaN  beer
1    Mary  bread  wine
2    Paul  beans   NaN
3   Peter   fish   NaN


## Example: US States Data

In [5]:
# Load the three input tables from CSV files, using paths relative to the
# working directory.
# NOTE(review): `pop` is also used earlier in this notebook for a Series; this
# line rebinds it to the population DataFrame.
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')

In [7]:
# Preview the first rows of each table to see their columns.
print(pop.head())
print(areas.head())
print(abbrevs.head())

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [9]:
# Attach the full state name to every population row. The key columns are
# named differently in the two tables ('state/region' vs 'abbreviation'), and
# an outer join keeps rows that have no match on either side.
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK
...,...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming,WY
2540,WY,total,1991,459260.0,Wyoming,WY
2541,WY,under18,1991,136720.0,Wyoming,WY
2542,WY,under18,1990,136078.0,Wyoming,WY


In [14]:
# 'abbreviation' was only needed as the right-hand join key, so drop it.
# errors='ignore' keeps this cell safe to re-run: `merged` is rebound here, so
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
merged = merged.drop(columns='abbreviation', errors='ignore')
merged

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska
...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming
2540,WY,total,1991,459260.0,Wyoming
2541,WY,under18,1991,136720.0,Wyoming
2542,WY,under18,1990,136078.0,Wyoming


In [17]:
# Show the rows whose population value is missing.
merged[merged['population'].isnull()]

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,
1877,PR,under18,1993,,
1878,PR,under18,1992,,
1879,PR,total,1992,,
1880,PR,under18,1994,,
1881,PR,total,1994,,


In [18]:
# Fill in the state name for the two region codes 'PR' and 'USA'.
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
# Check which columns still contain at least one missing value.
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool