In [1]:
# 3.1: Data Frames

In [3]:
# 3.1.1: Creating data frames

In [5]:
# Show floats in fixed-point notation with thousands separators and two
# decimals rather than scientific notation (similar to options(scipen=999) in R).
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format

In [27]:
# 3.1.1.1: Create a data frame from a dictionary
# Each key becomes a column name and each list holds that column's values.
x_dict = dict(a=[1, 2], b=[3, 4], c=[5, 6])
df = pd.DataFrame(data=x_dict)  # constructor from dictionary
# The leftmost 0, 1 in the printout is the index of the data frame.
print(df)
# Reference: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

   a  b  c
0  1  3  5
1  2  4  6


In [28]:
# 3.1.1.2: Fundamental data frame properties
# Printed one per line, in this order:
#   shape   - dimensions of the data frame, i.e. # rows by # columns
#   columns - the index of column names; dtype of object means the names are strings
#   dtypes  - the data type of each column
print(df.shape, df.columns, df.dtypes, sep='\n')

(2, 3)
Index(['a', 'b', 'c'], dtype='object')
a    int64
b    int64
c    int64
dtype: object


In [29]:
# 3.1.1.3: Fundamental data frame methods
print(
    df.describe(),  # summary statistics and percentiles of every numerical column
    df.head(1),     # df.head(n) returns only the first n rows
    df.tail(1),     # df.tail(n) returns only the last n rows
    sep='\n',
)

         a    b    c
count 2.00 2.00 2.00
mean  1.50 3.50 5.50
std   0.71 0.71 0.71
min   1.00 3.00 5.00
25%   1.25 3.25 5.25
50%   1.50 3.50 5.50
75%   1.75 3.75 5.75
max   2.00 4.00 6.00
   a  b  c
0  1  3  5
   a  b  c
1  2  4  6


In [44]:
# 3.1.1.4: Selecting columns, rows and cells of data frames
# Columns are selected with [] using the column index values, i.e. the column names.
print(df['a'])         # a single name selects one column
print(df[['a', 'c']])  # a list of names selects multiple columns (not necessarily contiguous)
# Rows are selected by their row index values using loc.
print('Select the first row:')
print(df.loc[0])       # a single row index selects one row
print(df.loc[[0, 1]])  # a list of row indices selects multiple rows (not necessarily contiguous)
# A single cell is addressed by its row and its column.
print('Select a single cell:')
print(df.loc[0, 'a'])  # loc addresses the cell by index values (row index, column name)
print(df.iloc[0, 0])   # iloc addresses the cell by position; numbering starts with 0

0    1
1    2
Name: a, dtype: int64
   a  c
0  1  5
1  2  6
Select the first row:
a    1
b    3
c    5
Name: 0, dtype: int64
   a  b  c
0  1  3  5
1  2  4  6
Select a single cell:
1
1


In [45]:
# 3.1.1.5: Adding and removing columns from a data frame
# Assigning to a column name that does not exist yet adds a new column
# to the existing data frame.
df['d'] = [7, 6]
print(df)
# Remove an existing column.  columns='a' is the keyword form of
# drop('a', axis=1), where an axis of 1 indicates columns (axis=0 is rows).
df = df.drop(columns='a')
print(df)

   a  b  c  d
0  1  3  5  7
1  2  4  6  6
   b  c  d
0  3  5  7
1  4  6  6


In [None]:
# 3.1.2: Operations on data frames

In [48]:
# 3.1.2.1: Using drop_duplicates to return unique combinations of column values
# Analogous to the "remove duplicates" button in Excel
x = dict(a=[1, 1, 2, 3], b=[3, 3, 3, 3], c=[1, 2, 3, 4])
df = pd.DataFrame(data=x)
print(df)
# Across the entire data frame: returns the unique rows
# (every row here is already unique, so all four rows come back).
print(df.drop_duplicates())
# Across a subset of columns: returns the unique combinations of a and b.
print(df[['a', 'b']].drop_duplicates())
# Across a single column: returns the unique values of only column b.
print(df['b'].drop_duplicates())

   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b
0  1  3
2  2  3
3  3  3
0    3
Name: b, dtype: int64


In [53]:
# 3.1.2.2: Sort a data frame by its columns
# Analogous to the single/multiple sort in Excel
# Columns are listed in order of priority and each one gets its own direction,
# so multiple columns can mix ascending and descending: here a is descending
# and c is ascending.  This extends to any number of columns of a data frame.
df.sort_values(['a', 'c'], ascending=[False, True])

Unnamed: 0,a,b,c
3,3,3,4
2,2,3,3
0,1,3,1
1,1,3,2


In [62]:
# 3.1.2.3: There are two ways to filter rows in Pandas
# Row filter
#   df.query()
#   df.loc[boolean_series, :]
# Column filter
#   df[[column1, column2]]
boolean_series = df['a'].eq(1)
print(boolean_series)  # a series of booleans, one per row
# Pass the boolean series as the row index to select only rows where column a is 1.
print(df.loc[boolean_series, :])
# Or build the boolean series inline and do it all at once.
print(df.loc[df['a'].eq(1), :])
# The query method takes the conditions as a string, which is convenient
# when multiple column conditions are required.
print('Using the query method:')
print(df.query('a==1 and c==1'))

0     True
1     True
2    False
3    False
Name: a, dtype: bool
   a  b  c
0  1  3  1
1  1  3  2
   a  b  c
0  1  3  1
1  1  3  2
Using the query method:
   a  b  c
0  1  3  1


In [None]:
# 
#df.apply(function, axis=1) # applies a function to each row.  Function is passed a single row
#pd.isnull() # missing data

In [None]:
Import supports a variety of file formats:
pd.read_csv(file_name, converters={})
pd.read_excel(file_name, sheet_name, converters={}) – need to conda install xlrd
pd.read_fwf(file_name, widths, converters={})

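Parsing is done through the converters dictionary, which maps a column name to the function used to convert its values (a data type such as str also works, because it is callable)
converters = {column_name: data_type, …}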


In [None]:
#
.index – the index of the data frame (similar to primary key in SQL)
	.loc[row_index, column_index]
Data frame methods
	.reset_index(drop=True) – reset the index to sequential numbering


In [None]:
# column accessor – [column_name] or .column_name (equivalent to $ in R).  The .column_name form only works when column_name is a valid Python identifier (e.g. it contains no spaces); [column_name] always works.
Series methods
	.value_counts() – shows the distribution of values
	.tolist() – converts the series to a list
	.map(function) – applies the function to each element
Source: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html


In [None]:
# filtering

In [None]:
# exporting
df.to_excel(file_name, index=False)  # need to conda install openpyxl
df.to_csv(file_name, index=False)
