# read_excel

In [None]:
import pandas as pd

# Path of the sample Excel workbook, relative to the working directory.
excel_path = './data/usa_email_sample_db.xlsx'

# Parse the workbook into a DataFrame and show it as the cell's output.
df = pd.read_excel(excel_path)
df

# Read tab separated

In [None]:
import pandas as pd

# Tab-separated text file: pass the tab character as the field separator.
tsv_path = './data/usa_email_sample_db.txt'
df = pd.read_csv(tsv_path, sep="\t")
df

# Read csv (with commas in the column)
- Try opening this file in Excel: everything ends up in a single column.
- `pd.read_csv` loads this file correctly.

In [None]:
import pandas as pd

# No `sep` argument: the comma-separated file is read with read_csv's defaults.
csv_path = './data/usa_email_sample_db.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# read_csv also accepts a URL in place of a local path (http, https, ftp, S3).
import pandas as pd

acme_url = 'https://vincentarelbundock.github.io/Rdatasets/csv/boot/acme.csv'
df = pd.read_csv(acme_url)
df

# Read parquet

In [2]:
! pip3 install python-snappy==0.5.4 --user # a requisite for snappy compression with fastparquet



In [4]:
import pandas as pd

# NOTE: `df` is not created in this cell; it is whatever DataFrame an earlier
# cell loaded, so run one of the read cells above first.
parquet_path = './data/usa_email_sample_db.parquet'

# Write the frame out as parquet, then read it back so this section
# actually demonstrates reading a parquet file.
df.to_parquet(parquet_path)
df_parquet = pd.read_parquet(parquet_path)
df_parquet

# Read from clipboard

In [None]:
import pandas as pd

# Copy a table to the clipboard before running this cell.
# The separator is the regex `\s+` (one or more whitespace characters). It is
# written as a raw string because `\s` is an invalid escape sequence in a
# plain string literal and triggers a warning on recent Python versions.
df = pd.read_clipboard(sep=r'\s+')
df

# Read Pickled objects
- Python's `pickle` module serializes and de-serializes Python object structures. Pickling converts a Python object (list, dict, etc.) into a byte stream that contains all the information necessary to reconstruct the object in another Python script.

In [None]:
import pandas as pd

pickle_path = "./data/dummy.pkl"

# Build a small two-column frame to round-trip through pickle.
df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})

# Serialize the frame to disk, then load it back into a separate variable.
pd.to_pickle(df, pickle_path)
# NOTE: only unpickle files you trust; loading a pickle can run arbitrary code.
unpickled_df = pd.read_pickle(pickle_path)
unpickled_df

# Read json
- The `orient` parameter is key: it tells pandas how the JSON is laid out. [Read more](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html)

In [None]:
import pandas as pd

# orient='records' means the file is a list of row objects:
# [{column: value, ...}, ...]
json_path = './data/test.json'
df = pd.read_json(json_path, orient='records')
df

# Read clipboard

In [None]:
import pandas as pd

# Copy a table to the clipboard before running this cell.
# Raw string: the regex `\s+` matches one or more whitespace characters, and
# without the `r` prefix `\s` is an invalid escape sequence that Python
# warns about.
df = pd.read_clipboard(sep=r'\s+')
df

In [None]:
import pandas as pd

# NOTE(review): judging by the file name, this file presumably contains a
# malformed row that makes a plain read fail — confirm against the data.
bad_tsv_path = './data/error_tab_line.txt'
df = pd.read_csv(bad_tsv_path, sep="\t")
df

# How do we address load issues
- Use a delimiter that never appears in any column's values (pipe-separated values, PSV, is widely used by database engineers)
- Manually clean up the file before reading (but how, if there are millions of rows?)
- Use the parameter "error_bad_lines" (replaced by "on_bad_lines" in pandas 1.3+, and removed in pandas 2.0) to ignore the bad lines and move on (clean them up later)

In [None]:
import pandas as pd

bad_tsv_path = './data/error_tab_line.txt'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# Its replacement is `on_bad_lines`; "warn" keeps the old behaviour of
# skipping malformed rows while still reporting each one.
try:
    df = pd.read_csv(bad_tsv_path, sep="\t", on_bad_lines="warn")
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the old keyword.
    df = pd.read_csv(bad_tsv_path, sep="\t", error_bad_lines=False)
df