# Series

In [1]:
import pandas as pd

In [2]:
obj1 = pd.Series([4, 7, -5, 3])
obj1

0    4
1    7
2   -5
3    3
dtype: int64

In [80]:
obj2 = pd.Series([4, 7, -5, 3], index = ['a', 'b', 'c', 'd'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [111]:
# or create a series from a dictionary
# keys will be the indice
data = {'summer':32, 'winter':10, 'spring':20, 'fall':25}
obj3 = pd.Series(data)
obj3

summer    32
winter    10
spring    20
fall      25
dtype: int64

In [112]:
obj3.values

array([32, 10, 20, 25])

In [113]:
obj3.index

Index(['summer', 'winter', 'spring', 'fall'], dtype='object')

In [114]:
obj3['summer']
# use the actual index

32

In [115]:
obj3[0]
# or use the default index

32

In [116]:
# add a value
obj3['year'] = 29
obj3

summer    32
winter    10
spring    20
fall      25
year      29
dtype: int64

In [117]:
obj3[obj3 > 20]

summer    32
fall      25
year      29
dtype: int64

In [118]:
obj3 * 2

summer    64
winter    20
spring    40
fall      50
year      58
dtype: int64

In [119]:
np.exp(obj3)

summer    7.896296e+13
winter    2.202647e+04
spring    4.851652e+08
fall      7.200490e+10
year      3.931334e+12
dtype: float64

In [120]:
'Chao' in obj3

False

In [2]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [7]:
data[data < 0]

1    -999.0
3    -999.0
4   -1000.0
dtype: float64

In [3]:
data[data < 0].values

array([ -999.,  -999., -1000.])

In [11]:
import numpy as np
data_new = data.replace(data[data < 0].values, np.nan)

In [12]:
data_new

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [13]:
np.isnan(data_new[1])

True

In [14]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [15]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [16]:
data.replace({-999: 1000000, -1000: 2000000})  
# we can also pass a dictionary to replace the values

0          1.0
1    1000000.0
2          2.0
3    1000000.0
4    2000000.0
5          3.0
dtype: float64

In [17]:
data   
# data stays unchanged

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

# DataFrame

## Creating a df from an array

In [1]:
import pandas as pd
import numpy as np

In [2]:
array = np.random.randint(0,100, 20).reshape(4, 5)
array

array([[32, 39, 95, 32, 11],
       [54, 36, 58, 16, 55],
       [44, 73, 82, 99, 49],
       [44, 64, 37, 18, 86]])

In [3]:
df1 = pd.DataFrame(array)
df1

Unnamed: 0,0,1,2,3,4
0,32,39,95,32,11
1,54,36,58,16,55
2,44,73,82,99,49
3,44,64,37,18,86


In [4]:
rlabels = ["r1", "r2", "r3", "r4"]
clabels = ['c1', 'c2', 'c3', 'c4', 'c5']
df2 = pd.DataFrame(array, index = rlabels, columns = clabels)

df2

Unnamed: 0,c1,c2,c3,c4,c5
r1,32,39,95,32,11
r2,54,36,58,16,55
r3,44,73,82,99,49
r4,44,64,37,18,86


## Creating a df from a dictionary

In [5]:
# create a list of tuples, and then convert to a dataframe

data = {'summer':32, 'winter':10, 'spring':20, 'fall':25}
temp = pd.DataFrame(list(data.items()), columns = ['month', 'temp'])
temp

Unnamed: 0,month,temp
0,summer,32
1,winter,10
2,spring,20
3,fall,25


In [6]:
# alternatively

data2 = {
    'Month': list(data.keys()),
    'Temperature': list(data.values())
}

temp2 = pd.DataFrame(data2)
temp2

Unnamed: 0,Month,Temperature
0,summer,32
1,winter,10
2,spring,20
3,fall,25


In [73]:
# create column values in a dictionary, and then convert to a dataframe

data3 = {'course': ['MSBA7001', 'MSBA7002', 'MSBA7003', 'MSBA7004', 'Capstone'],
        'instructor': ['Chao', 'Haipeng', 'Wei', 'Eric', 'Haipeng'],
        'grade': [90, 83, 79, 87.5, 95]}
msba = pd.DataFrame(data3)
msba

Unnamed: 0,course,instructor,grade
0,MSBA7001,Chao,90.0
1,MSBA7002,Haipeng,83.0
2,MSBA7003,Wei,79.0
3,MSBA7004,Eric,87.5
4,Capstone,Haipeng,95.0


In [74]:
msba.describe()
# only the grade column is numeric values

Unnamed: 0,grade
count,5.0
mean,86.9
std,6.188699
min,79.0
25%,83.0
50%,87.5
75%,90.0
max,95.0


In [75]:
msba.grade.describe()
# or you could also choose which numeric column to summarize

count     5.000000
mean     86.900000
std       6.188699
min      79.000000
25%      83.000000
50%      87.500000
75%      90.000000
max      95.000000
Name: grade, dtype: float64

In [76]:
# change the default row index

msba.index = ['r1', 'r2', 'r3', 'r4', 'r5']
msba

Unnamed: 0,course,instructor,grade
r1,MSBA7001,Chao,90.0
r2,MSBA7002,Haipeng,83.0
r3,MSBA7003,Wei,79.0
r4,MSBA7004,Eric,87.5
r5,Capstone,Haipeng,95.0


## Selecting columns and adding a new column

In [11]:
msba['grade']
# it's a Series

r1    90.0
r2    83.0
r3    79.0
r4    87.5
r5    95.0
Name: grade, dtype: float64

In [13]:
# s.values: returns an array of values
msba['grade'].values

array([90. , 83. , 79. , 87.5, 95. ])

In [14]:
msba[['grade','course']]
# to view multiple columns, insert a list

Unnamed: 0,grade,course
r1,90.0,MSBA7001
r2,83.0,MSBA7002
r3,79.0,MSBA7003
r4,87.5,MSBA7004
r5,95.0,Capstone


In [20]:
msba.grade

r1    90.0
r2    83.0
r3    79.0
r4    87.5
r5    95.0
Name: grade, dtype: float64

In [21]:
msba.grade[0:2]

r1    90.0
r2    83.0
Name: grade, dtype: float64

In [22]:
# add 'attendance' column, which is a percentage
msba.attendance = [1, 0.8, 0.9, 0.75, 0.6]
# this does not work

msba

  msba.attendance = [1, 0.8, 0.9, 0.75, 0.6]


Unnamed: 0,course,instructor,grade
r1,MSBA7001,Chao,90.0
r2,MSBA7002,Haipeng,83.0
r3,MSBA7003,Wei,79.0
r4,MSBA7004,Eric,87.5
r5,Capstone,Haipeng,95.0


In [77]:
msba['attendance'] = [-1, -0.8, -0.9, -0.75, -0.6]

In [78]:
msba

Unnamed: 0,course,instructor,grade,attendance
r1,MSBA7001,Chao,90.0,-1.0
r2,MSBA7002,Haipeng,83.0,-0.8
r3,MSBA7003,Wei,79.0,-0.9
r4,MSBA7004,Eric,87.5,-0.75
r5,Capstone,Haipeng,95.0,-0.6


In [25]:
# what is this now?
msba.attendance

# it's not pulling the attendance column

[1, 0.8, 0.9, 0.75, 0.6]

In [79]:
# add 'fav' column, which is a binary value: 0 or 1
np.random.seed(101)
msba['fav'] = np.random.randint(0, 2, size = 5)
# this works
msba

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1
r2,MSBA7002,Haipeng,83.0,-0.8,1
r3,MSBA7003,Wei,79.0,-0.9,1
r4,MSBA7004,Eric,87.5,-0.75,0
r5,Capstone,Haipeng,95.0,-0.6,1


In [27]:
msba.columns

Index(['course', 'instructor', 'grade', 'attendance', 'fav'], dtype='object')

In [28]:
sorted(list(msba.columns))

['attendance', 'course', 'fav', 'grade', 'instructor']

## Selecting rows and adding a new row

In [29]:
msba.loc['r3']   # loc works on the row labels

course        MSBA7003
instructor         Wei
grade             79.0
attendance        -0.9
fav                  1
Name: r3, dtype: object

In [30]:
msba.loc[['r1', 'r3']]
# to view multiple rows, insert a list

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1
r3,MSBA7003,Wei,79.0,-0.9,1


In [31]:
msba.loc['r1':'r3']
# or insert a slice of rows

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1
r2,MSBA7002,Haipeng,83.0,-0.8,1
r3,MSBA7003,Wei,79.0,-0.9,1


In [32]:
msba.loc['r1':'r3', 'course']
# or insert both the row label and column label, separated by a comma

r1    MSBA7001
r2    MSBA7002
r3    MSBA7003
Name: course, dtype: object

In [33]:
msba.iloc[2]   # iloc works on the default index

course        MSBA7003
instructor         Wei
grade             79.0
attendance        -0.9
fav                  1
Name: r3, dtype: object

In [34]:
msba.iloc[2:5] 

Unnamed: 0,course,instructor,grade,attendance,fav
r3,MSBA7003,Wei,79.0,-0.9,1
r4,MSBA7004,Eric,87.5,-0.75,0
r5,Capstone,Haipeng,95.0,-0.6,1


In [35]:
msba.iloc[2:5]['fav']

r3    1
r4    0
r5    1
Name: fav, dtype: int64

In [80]:
# insert a new row from a list
msba.loc['r6'] = ['MSBA7005', 'Michael',90, -0.9,1]
msba

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1
r2,MSBA7002,Haipeng,83.0,-0.8,1
r3,MSBA7003,Wei,79.0,-0.9,1
r4,MSBA7004,Eric,87.5,-0.75,0
r5,Capstone,Haipeng,95.0,-0.6,1
r6,MSBA7005,Michael,90.0,-0.9,1


In [37]:
# insert a new row using iloc?
msba.iloc[6] = ['MSBA7006', 'Dan', 60, -0.75, 1]
# this does not work
msba

IndexError: iloc cannot enlarge its target object

In [81]:
# insert a new row from a Series?
msba.loc['r7'] = pd.Series(['MSBA7007', 'Will', 79, -1, 0])
# this adds a row of NaN, becasue the labels do not match
msba

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,,,,,


In [82]:
# update the row by a tuple
msba.loc['r7'] = ('MSBA7007', 'Will', 79, -0.8, 0)
msba

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,MSBA7007,Will,79.0,-0.8,0.0


In [83]:
msba.index

Index(['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'], dtype='object')

## Selecting values from a df

In [41]:
msba.loc[msba.grade >= 90]

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0


In [42]:
msba.loc[msba.fav == 1.0]

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0


In [43]:
# use parentheses for comparisions. & means "and"
msba.loc[(msba.fav == 1) & (msba.grade >= 90)]

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0


In [44]:
# | means "or"
msba.loc[(msba.fav == 1) | (msba.grade >= 90)]

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0


In [46]:
msba.filter(regex = 'e$')
# filter applies on the labels, not on the df contents.

Unnamed: 0,course,grade,attendance
r1,MSBA7001,90.0,-1.0
r2,MSBA7002,83.0,-0.8
r3,MSBA7003,79.0,-0.9
r4,MSBA7004,87.5,-0.75
r5,Capstone,95.0,-0.6
r6,MSBA7005,90.0,-0.9
r7,MSBA7007,79.0,-0.8


## Removing rows and columns from a df

In [32]:
# to delete a column

msba.drop('grade', axis=1)

Unnamed: 0,course,instructor,attendance,fav
r1,MSBA7001,Chao,-1.0,1.0
r2,MSBA7002,Haipeng,-0.8,1.0
r3,MSBA7003,Wei,-0.9,1.0
r4,MSBA7004,Eric,-0.75,0.0
r5,Capstone,Haipeng,-0.6,1.0
r6,MSBA7005,Michael,-0.9,1.0
r7,MSBA7007,Will,-0.8,0.0


In [33]:
msba
# it does not change the original df

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,MSBA7007,Will,79.0,-0.8,0.0


In [84]:
# make a copy to a new df

msba2 = msba.drop('grade', axis=1)
msba2

Unnamed: 0,course,instructor,attendance,fav
r1,MSBA7001,Chao,-1.0,1.0
r2,MSBA7002,Haipeng,-0.8,1.0
r3,MSBA7003,Wei,-0.9,1.0
r4,MSBA7004,Eric,-0.75,0.0
r5,Capstone,Haipeng,-0.6,1.0
r6,MSBA7005,Michael,-0.9,1.0
r7,MSBA7007,Will,-0.8,0.0


In [85]:
# to delete rows

msba2.drop(['r1', 'r2'])

Unnamed: 0,course,instructor,attendance,fav
r3,MSBA7003,Wei,-0.9,1.0
r4,MSBA7004,Eric,-0.75,0.0
r5,Capstone,Haipeng,-0.6,1.0
r6,MSBA7005,Michael,-0.9,1.0
r7,MSBA7007,Will,-0.8,0.0


## `NaN`

In [96]:
msba3 = msba.copy()

In [97]:
msba3

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,79.0,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,MSBA7007,Will,79.0,-0.8,0.0


In [98]:
msba3['grade'].loc['r3'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [99]:
msba3

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,MSBA7007,Will,79.0,-0.8,0.0


In [100]:
msba3.isnull()
# pd.DataFrame.isna also works the same

Unnamed: 0,course,instructor,grade,attendance,fav
r1,False,False,False,False,False
r2,False,False,False,False,False
r3,False,False,True,False,False
r4,False,False,False,False,False
r5,False,False,False,False,False
r6,False,False,False,False,False
r7,False,False,False,False,False


In [101]:
msba3.isnull().sum(axis = 1)

r1    0
r2    0
r3    1
r4    0
r5    0
r6    0
r7    0
dtype: int64

In [102]:
msba3.isnull().any()

course        False
instructor    False
grade          True
attendance    False
fav           False
dtype: bool

In [118]:
# to find out the row indices of NaN values
msba3.loc[msba3.isnull().any(axis = 1)].index

Index(['r3'], dtype='object')

In [103]:
# pd.DataFrame.fillna or pd.Series.fillna
# fill NaN values with new values
msba3['grade'].fillna(method = 'ffill', axis = 0)
# the NaN value is filled by its previous obersavtion

r1    90.0
r2    83.0
r3    83.0
r4    87.5
r5    95.0
r6    90.0
r7    79.0
Name: grade, dtype: float64

In [95]:
# pd.DataFrame.interpolate or pd.Series.interpolate
# insert new values to replace NaN based on various methods
msba3.interpolate(method = 'linear')
# the NaN value is replaced by the ave of its neighbors along axis 0.

Unnamed: 0,course,instructor,grade,attendance,fav
r1,MSBA7001,Chao,90.0,-1.0,1.0
r2,MSBA7002,Haipeng,83.0,-0.8,1.0
r3,MSBA7003,Wei,85.25,-0.9,1.0
r4,MSBA7004,Eric,87.5,-0.75,0.0
r5,Capstone,Haipeng,95.0,-0.6,1.0
r6,MSBA7005,Michael,90.0,-0.9,1.0
r7,MSBA7007,Will,79.0,-0.8,0.0


## Remove duplicates in a dataframe

In [36]:
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [37]:
dup =  data.duplicated()
dup

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [38]:
type(dup)

pandas.core.series.Series

In [39]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [40]:
# remove duplicates based on k1 column
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [41]:
# remove duplicates based on k1 column
# and search from the end
data.drop_duplicates(['k1'], keep = 'last')

Unnamed: 0,k1,k2
2,one,2
6,two,4


In [42]:
# what is data now?
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


## Reading from CSV files

In [72]:
data = pd.read_csv('../data_in/advertising.csv', index_col = 0)
data.shape

(200, 4)

In [73]:
data.head(3)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3


In [74]:
data.tail(2)

Unnamed: 0,TV,radio,newspaper,sales
199,283.6,42.0,66.2,25.5
200,232.1,8.6,8.7,13.4


In [76]:
type(data)

pandas.core.frame.DataFrame

In [77]:
pd.read_csv('../data_in/advertising.csv', index_col = 0, header = None).head(3)

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,TV,radio,newspaper,sales
1.0,230.1,37.8,69.2,22.1
2.0,44.5,39.3,45.1,10.4


In [78]:
pd.read_csv('../data_in/advertising.csv', index_col = 0, names = ['col1', 'col2', 'col3', 'col4']).head(3)

Unnamed: 0,col1,col2,col3,col4
,TV,radio,newspaper,sales
1.0,230.1,37.8,69.2,22.1
2.0,44.5,39.3,45.1,10.4


## Writing to a CSV file

In [84]:
msba.to_csv('../data_out/msba.csv')

In [85]:
msba.to_csv('../data_out/msba_new.csv', sep = '|', index = False, header = False)