In [1]:
import pandas as pd
import numpy as np

In [8]:
# A Series built from mixed floats, ints and strings is stored with dtype=object;
# each element keeps its own Python type.
data = pd.Series([0.25, 'a', 1, 'b'])
print(data, type(data), type(data[1]), sep="\n")

# The two halves of a Series: the underlying values and the index.
print(data.values)
print(data.index)


0    0.25
1       a
2       1
3       b
dtype: object
<class 'pandas.core.series.Series'>
<class 'str'>
[0.25 'a' 1 'b']
RangeIndex(start=0, stop=4, step=1)


In [20]:
# A scalar is repeated once for every label in the supplied index.
data1 = pd.Series(data=5, index=[100, 200, 300])
print(data1)

100    5
200    5
300    5
dtype: int64


In [16]:
# Series as a generalized NumPy array:
# a pandas Series carries an explicitly defined index alongside its values,
# and that index need not consist of integers.
data = pd.Series([0.25, 0.5, 0.75, 1], index=['a', 'b', 'c', 'd'])
print(data)
# .iloc slices by position and excludes the stop position (positions 1 and 2 here).
print(data.iloc[1:3])
# .loc slices by label and includes the stop label ('a' through 'c' here).
print("access by index: ",data.loc['a':'c'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
b    0.50
c    0.75
dtype: float64
access by index:  a    0.25
b    0.50
c    0.75
dtype: float64


In [19]:
# Series as a specialized dictionary:
# the dictionary keys become the index and the dictionary values become the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
print(population, population.index, population.values, sep="\n")



California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
[38332521 26448193 19651127 19552860 12882135]


In [24]:
dict1 = {2: 'a', 1: 'b', 3: 'c', 4: 'e', 5: 'd'}
print("whole dictionary is: ", dict1)

# An explicit index selects (and orders) only the listed keys of the dictionary.
short_series = pd.Series(data=dict1, index=[3, 2, 4])
print("subset series is : ", short_series)



whole dictionary is:  {2: 'a', 1: 'b', 3: 'c', 4: 'e', 5: 'd'}
subset series is :  3    c
2    a
4    e
dtype: object


In [33]:
# DataFrame as a generalized NumPy array:
# a two-dimensional array with both flexible row indices and flexible column names.
# This cell prepares a second Series, keyed by state name, holding the area values.
area_dict = {
    'California': 423967,
    'New Jersey': 141297,
    'Florida': 170312,
    'Texas': 695662,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
print(area)

California    423967
New Jersey    141297
Florida       170312
Texas         695662
Illinois      149995
dtype: int64


In [42]:
# Combine the two Series into a DataFrame, one column per Series.
# Rows are matched on index label, so the Series may list their labels in different orders.
# A state missing from the population Series gets NaN in the population column,
# and a state missing from the area Series gets NaN in the area column.
states = pd.DataFrame(dict(population=population, area=area))
print(states)
print("Index of dataframe: ", states.index)
print("columns of dataframe: ", states.columns)
print("values of dataframe: ", states.values)
# Each column pulled out of the DataFrame is itself a Series.
print("type of column data: ", type(states['population']))
print("type of column data: ", type(states['area']))

            population      area
California  38332521.0  423967.0
Florida     19552860.0  170312.0
Illinois    12882135.0  149995.0
New Jersey         NaN  141297.0
New York    19651127.0       NaN
Texas       26448193.0  695662.0
Index of dataframe:  Index(['California', 'Florida', 'Illinois', 'New Jersey', 'New York', 'Texas'], dtype='object')
columns of dataframe:  Index(['population', 'area'], dtype='object')
values of dataframe:  [[38332521.   423967.]
 [19552860.   170312.]
 [12882135.   149995.]
 [      nan   141297.]
 [19651127.       nan]
 [26448193.   695662.]]
type of column data:  <class 'pandas.core.series.Series'>
type of column data:  <class 'pandas.core.series.Series'>


In [46]:
# DataFrame as a specialized dictionary:
# each column name maps to a Series holding that column's data.
print(states['area'])
# In a two-dimensional NumPy array, data[0] returns the first row.
# For a DataFrame, data['col0'] returns the first column.


California    423967.0
Florida       170312.0
Illinois      149995.0
New Jersey    141297.0
New York           NaN
Texas         695662.0
Name: area, dtype: float64


In [47]:
# Build a one-column DataFrame from a single Series object;
# the columns argument supplies the name of that column.
df1 = pd.DataFrame(data=population, columns=['population'])
print(df1)

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135


In [53]:
# Create a DataFrame from a list of dictionaries:
# each dictionary is one row and its keys become the column names.
data2 = [dict(a=n, b=n * n) for n in range(3)]
print(data2)
df2 = pd.DataFrame(data=data2)
print("dataframe is: ", df2)
print("dataframe columns: ", df2.columns)
print("dataframe index: ", df2.index)
print("dataframe values: ", df2.values)


[{'a': 0, 'b': 0}, {'a': 1, 'b': 1}, {'a': 2, 'b': 4}]
dataframe is:     a  b
0  0  0
1  1  1
2  2  4
dataframe columns:  Index(['a', 'b'], dtype='object')
dataframe index:  RangeIndex(start=0, stop=3, step=1)
dataframe values:  [[0 0]
 [1 1]
 [2 4]]


In [55]:
# When the dictionaries do not share all keys, the columns are the union of the keys
# and any entry a row lacks is filled with NaN.
data3 = [dict(a=1, b=2, c=3), dict(b=2 ** 2, c=3 ** 2, d=4 ** 2)]
print(data3)
df3 = pd.DataFrame(data=data3)
print(df3)

[{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 9, 'd': 16}]
     a  b  c     d
0  1.0  2  3   NaN
1  NaN  4  9  16.0


In [61]:
# Create a DataFrame from a 2D NumPy array, supplying row and column labels.
# A seeded Generator makes the random integers (0 through 9) identical on every
# fresh run, so the cell is reproducible under Restart & Run All.
rng = np.random.default_rng(42)
df4 = pd.DataFrame(rng.integers(10, size=(3, 2)),
                   columns=['one', 'two'],
                   index=['r1', 'r2', 'r3'])
print(df4)

    one  two
r1    1    8
r2    3    6
r3    2    3


In [65]:
# Create a DataFrame from a NumPy structured array:
# each named field of the structured dtype ('A' as i8, 'B' as f8) becomes a column.
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
df5 = pd.DataFrame(A)
print(df5)

   A    B
0  0  0.0
1  0  0.0
2  0  0.0
