# Organizing data with Pandas

(c) Wouter van Atteveldt, CC-BY-SA

# Outline

1. *Pandas data structures: series and dataframes*
2. Calculating and changing data
3. Combining and Merging Data


<h1>Pandas data structures: Series</h1>

+ One dimensional list of values
+ Single type
+ (like R Vector)

In [1]:
import pandas as pd
# A plain list becomes a Series; with no index given the labels default to 0, 1, 2, ...
s = pd.Series([4, 3, 2])
s

0    4
1    3
2    2
dtype: int64

(ch. 5, Python for Data Analysis)

# Series are *Indexed*

+ Series has an index alongside the values
  + Like R rownames
+ Can select by name or number

In [2]:
# index= supplies an explicit label for every value
s = pd.Series([4,3,2], index=["A", "B", "C"])
s

A    4
B    3
C    2
dtype: int64

In [3]:
# .iloc selects by position, [] by label; `in` tests the index labels (not the values).
# (A bare s[0] on a string-labelled Series relies on a deprecated positional fallback.)
s.iloc[0], s["B"], "C" in s, "D" in s

(4, 3, True, False)

# Creating series from dicts

In [4]:
# Dict keys become the index labels, dict values become the data
ages = dict(John=23, Mary=46, Mike=33)
x = pd.Series(data=ages)
x

John    23
Mary    46
Mike    33
dtype: int64

+ Manually choose keys (via `index=`) to determine the order and to select which entries are included

In [5]:
# Display the source dict again for reference
ages

{'John': 23, 'Mary': 46, 'Mike': 33}

In [6]:
# Only the labels listed in index= are used, in that order ("Mike" is left out);
# "Pete" is not a key of ages, so his value is missing (NaN)
x = pd.Series(ages, index=["Mary", "John", "Pete"])
x  # Note: int has no NA, so result is float

Mary    46.0
John    23.0
Pete     NaN
dtype: float64

# Viewing and changing indices

In [7]:
# The labels are stored in a separate Index object
x.index

Index(['Mary', 'John', 'Pete'], dtype='object')

In [8]:
# Assigning to .index relabels the existing values (same order, new labels)
x.index = ["A", "B", "C"]
# The index can carry a name, which is printed above the labels
x.index.name = "Group"
x

Group
A    46.0
B    23.0
C     NaN
dtype: float64

In [9]:
# reindex returns the values for the requested labels in the requested order; "C" is left out
x.reindex(["B", "A"])

Group
B    23.0
A    46.0
dtype: float64

In [10]:
# fill_value is only used for labels that are new to the index; "C" already
# exists (holding NaN), so its NaN is kept rather than replaced by 0
x.reindex(["A", "B", "C"], fill_value=0)

Group
A    46.0
B    23.0
C     NaN
dtype: float64

# Dataframes

+ Rectangular data format
+ Columns are *Series*

In [11]:
# Column-oriented input: each key is a column name, each list holds that column's values
data = {
    "Name": ["John", "Sue", "Mary"],
    "Age": [23, 19, 21],
    "Group": ["A", "A", "B"],
}
ages = data  # both names refer to the same dict
# Build the frame from the dict as-is ...
d = pd.DataFrame(ages)
# ... or pass columns= to choose the column order explicitly
d = pd.DataFrame(ages, columns=["Name", "Group", "Age"])
d

Unnamed: 0,Name,Group,Age
0,John,A,23
1,Sue,A,19
2,Mary,B,21


# Dataframes and indexes

+ Rows are also *indexed*
+ Can select rows, columns by name or number

In [12]:
# Use the names as row labels (index) and keep only the Group and Age columns
d = pd.DataFrame(ages, columns=["Group", "Age"], index=ages["Name"])
d

Unnamed: 0,Group,Age
John,A,23
Sue,A,19
Mary,B,21


In [13]:
# Attribute access returns the column as a Series, labelled by the row index
d.Age

John    23
Sue     19
Mary    21
Name: Age, dtype: int64

In [14]:
# Select one row by its index label (.loc replaces .ix, which was removed in pandas 1.0)
d.loc["Sue"]

Group     A
Age      19
Name: Sue, dtype: object

# Outline

1. Pandas data structures: series and dataframes
2. *Calculating and changing data*
3. Combining and Merging Data


# Creating and changing columns

In [15]:
# A list is assigned by position, one value per row
d["length"] = [178, 182, 176]
# A Series is aligned on the index: Sue has no entry, so her sport is missing
d["sport"] = pd.Series({"John": "Football", "Mary": "Hockey"})
# Assigning to an existing column name overwrites that column
d["Group"] = ["A", "B", "B"]
d

Unnamed: 0,Group,Age,length,sport
John,A,23,178,Football
Sue,B,19,182,
Mary,B,21,176,Hockey


# Sorting Data

In [16]:
# sort_values returns a sorted copy and leaves d untouched, so displaying d
# below still shows the original row order; assign the result to keep it
d.sort_values(by=["Group", "Age"], ascending=False)
d

Unnamed: 0,Group,Age,length,sport
John,A,23,178,Football
Sue,B,19,182,
Mary,B,21,176,Hockey


# Removing rows, columns

In [17]:
# Each drop() call starts from d, so the second assignment replaces the first
# and Sue is still present in the final d2
d2 = d.drop("Sue") # returns copy
d2 = d.drop("length", axis=1) # returns copy
del d2["Group"] # modifies in-place!
d2

Unnamed: 0,Age,sport
John,23,Football
Sue,19,
Mary,21,Hockey


# Reindexing

+ Specify new index
  + data is adjusted
+ Can drop, add rows or columns

In [18]:
# Rows follow the requested labels: "Carol" is not in d and gets all-missing values,
# while "Mary" is not requested and is left out
d.reindex(["John", "Carol", "Sue"])

Unnamed: 0,Group,Age,length,sport
John,A,23.0,178.0,Football
Carol,,,,
Sue,B,19.0,182.0,


In [19]:
# Reindexing on columns keeps only the listed columns, in the listed order
d.reindex(columns=["Group", "sport"])

Unnamed: 0,Group,sport
John,A,Football
Sue,B,
Mary,B,Hockey


# Selecting data

+ Select rows by slice or condition

In [20]:
# All yield same result:
d[d.Group == "B"]         # boolean condition
d[1:3]                    # positional row slice
d["Sue":"Mary"]           # label slice (end label is included)
d.loc[["Sue", "Mary"]]    # explicit list of labels (.loc replaces the removed .ix)

Unnamed: 0,Group,Age,length,sport
Sue,B,19,182,
Mary,B,21,176,Hockey


+ Select columns by name(s) or indices 

In [21]:
# Both select the Age and sport columns:
d.iloc[:, [1, 3]]      # by position; d[[1, 3]] would look for columns *named* 1 and 3
d[["Age", "sport"]]    # by name

Unnamed: 0,Age,sport
John,23,Football
Sue,19,
Mary,21,Hockey


# Selecting with .ix

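+ `d[...]` cannot select a single row
+ `d.ix[row(s)]` or `d.ix[rows, columns]`
+ Can use names, index numbers, slices
+ Note: `.ix` was removed in pandas 1.0; use `.loc` (names) or `.iloc` (index numbers) instead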

In [22]:
# Select the second row by position (.iloc replaces .ix, which was removed in pandas 1.0)
d.iloc[1]

Group       B
Age        19
length    182
sport     NaN
Name: Sue, dtype: object

In [23]:
# .ix was removed in pandas 1.0: use .iloc for positions and .loc for labels/conditions
d.iloc[[0, 1], [1, 2, 3]]               # lists of row and column positions
d.iloc[:2, 1:]                          # positional slices
d.loc[d.Group == "B", "Age":"sport"]    # row condition plus a column label slice

Unnamed: 0,Age,length,sport
Sue,19,182,
Mary,21,176,Hockey


# Dropping missing values

In [24]:
# Drops every row that has at least one missing value (here Sue, who has no sport)
d.dropna() # same as R's na.omit

Unnamed: 0,Group,Age,length,sport
John,A,23,178,Football
Mary,B,21,176,Hockey


In [25]:
# No row of d is entirely missing, so nothing is dropped here
d.dropna(how='all') # only drops rows with all NAs

Unnamed: 0,Group,Age,length,sport
John,A,23,178,Football
Sue,B,19,182,
Mary,B,21,176,Hockey


# Outline

1. Pandas data structures: series and dataframes
2. Calculating and changing data
3. *Combining and Merging Data*


# Concatenating data

+ add rows: `pd.concat(objects)`
+ add columns: `pd.concat(objects, axis=1)`

In [26]:
# Two series that both carry the default index 0..2
x, y = pd.Series([1, 2, 3]), pd.Series([4, 5, 6])
# Stack them end to end; the original index labels are kept
pd.concat([x, y])

0    1
1    2
2    3
0    4
1    5
2    6
dtype: int64

In [27]:
# axis=1 places the series side by side as columns, matched on their index labels
pd.concat([x,y], axis=1)

Unnamed: 0,0,1
0,1,4
1,2,5
2,3,6


# Concatenate and indices

In [28]:
# ignore original indices when adding rows:
# the result is renumbered 0..n-1 instead of repeating the old labels
pd.concat([x,y], ignore_index=True)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [29]:
# Use index to merge data:
# y2 is labelled 1..3, which only partly overlaps the 0..2 labels of x
y2 = pd.Series([5,6,7], index=[1,2,3])
y2

1    5
2    6
3    7
dtype: int64

In [30]:
# Values are matched on index label; a label missing from one series yields NaN there
pd.concat([x, y2], axis=1)

Unnamed: 0,0,1
0,1.0,
1,2.0,5.0
2,3.0,6.0
3,,7.0


# Combine series by creating a dataframe

In [31]:
# Dict keys become the column names; the series are aligned on their index labels
pd.DataFrame({"x": x, "y":y, "y2": y2})

Unnamed: 0,x,y,y2
0,1.0,4.0,
1,2.0,5.0,5.0
2,3.0,6.0,6.0
3,,,7.0


# Adding a column to a data frame

In [32]:
# Sue has no entry in gender, so she ends up with a missing value either way
gender = pd.Series({"John": "M", "Mary": "F"})

# add by creating new column
d['gender2'] = gender

# add by using concat on axis=1
# (the series name becomes the name of the new column)
gender.name = "Gender"
d = pd.concat([d, gender], axis=1)

d

Unnamed: 0,Group,Age,length,sport,gender2,Gender
John,A,23,178,Football,M,M
Mary,B,21,176,Hockey,F,F
Sue,B,19,182,,,


# Merging two dataframes

+ Merge based on index or columns
+ Type of join:
    + Inner join: keep only rows in both frames
    + Outer join: keep all rows
    + Left/Right join: keep all rows from left/right frame
+ Methods `join` and `merge`: different defaults
    + `d.join(d2)`: default left join on indices
    + `d.merge(d2)`: default inner join on common names (=R merge)

# Merging two dataframes on index

In [33]:
# A second frame indexed by person; Pete does not occur in d, Mary does not occur here
people = ["John", "Sue", "Pete"]
locations = {
    "Country": ["US", "US", "UK"],
    "City": ["NY", "LA", "London"],
}
d2 = pd.DataFrame(locations, index=people)
d2

Unnamed: 0,City,Country
John,NY,US
Sue,LA,US
Pete,London,UK


In [34]:
# Left join on the index: every row of d is kept, Pete (only in d2) is left out
d.join(d2)

Unnamed: 0,Group,Age,length,sport,gender2,Gender,City,Country
John,A,23,178,Football,M,M,NY,US
Mary,B,21,176,Hockey,F,F,,
Sue,B,19,182,,,,LA,US


In [35]:
# The same left join spelled with merge: match on both indices, keep all rows of d
d.merge(d2, left_index=True, right_index=True, how="left")

Unnamed: 0,Group,Age,length,sport,gender2,Gender,City,Country
John,A,23,178,Football,M,M,NY,US
Mary,B,21,176,Hockey,F,F,,
Sue,B,19,182,,,,LA,US


# Merging two dataframes on columns

In [36]:
# Two frames that share a "City" column
cities = pd.DataFrame(
    {
        "Country": ["US", "US", "UK"],
        "City": ["NY", "LA", "London"],
    }
)
population = pd.DataFrame(
    {
        "City": ["NY", "LA", "London"],
        "population": [9, 4, 8],
    }
)
# With no keys given, merge joins on the column names the two frames have in common
cities.merge(population)
                    

Unnamed: 0,City,Country,population
0,NY,US,9
1,LA,US,4
2,London,UK,8


# Computing Simple aggregate statistics

+ Often need to compute values over multiple groups
+ `series.value_counts()`: counts per value
+ `df.mean`, `df.sum`, etc: column totals (row totals: add `axis=1`)
+ `df.groupby(columns)[columns].agg(function)`: totals per group

In [37]:
# Average the numeric columns only; text columns such as Group cannot be averaged
# (pandas 2.x raises an error instead of silently skipping them)
d.mean(numeric_only=True)

Age        21.000000
length    178.666667
dtype: float64

In [38]:
# Number of rows for each distinct value of Group
d.Group.value_counts()

B    2
A    1
Name: Group, dtype: int64

# Aggregate statistics per group

In [39]:
# Mean of Age and length per Group; several columns are selected with a list
# (double brackets) - the bare tuple form ["Age", "length"] was removed in pandas 2.0
d.groupby("Group")[["Age", "length"]].agg('mean')

Unnamed: 0_level_0,Age,length
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,23,178
B,20,179
