# Data Loading, Storage, and File Formats
Reading data and making it accessible (data loading).

## 6.1 Reading and Writing data in Text Format


In [31]:
import pandas as pd
import numpy as np

In [14]:
# Several ways of calling pd.read_csv; only one variant is left active below.

# `names` supplies the column labels explicitly
# df = pd.read_csv("./datasets/ex1.csv", names=["a", "b", "c", "d", "message"])

# header=None: take no labels from the file; columns are numbered 0, 1, 2, ...
# df = pd.read_csv("./datasets/ex1.csv", header=None)

# Active variant: use the "message" column as the row index labels
df = pd.read_csv("./datasets/ex1.csv", index_col="message")

# Hierarchical index built from more than one column
# df = pd.read_csv("./datasets/csv_mindex.csv", index_col=["key1", "key2"])

# A table without a fixed delimiter (e.g. a txt file whose fields are separated
# by one or more whitespace characters) can still be read with read_csv by
# passing a regular expression as `sep`
# result = pd.read_csv("examples/ex3.txt", sep=r"\s+")

# Skip specific rows of the csv file, given by row number
# pd.read_csv("./datasets/ex1.csv", skiprows=[0, 2, 3])





In [None]:
# keep_default_na=False switches off pandas' built-in list of NA markers, so
# only the strings listed in na_values are converted to NaN
# pd.read_csv("some-file", keep_default_na=False, na_values=["NA"])

# Per-column NA markers: in the "message" column "foo" and "na" become NaN;
# in the "something" column "two" becomes NaN
sentinels = {
    "message": ["foo", "na"],
    "something": ["two"],
}
# pd.read_csv("some-example", na_values=sentinels, keep_default_na=False)



### Some pandas.read_csv function arguments
| Argument | Description |
|-|-| 
| path | String indicating the file system location |
| sep or delimiter | Character sequence used to split fields in each row |
| header | Row number to use as column names; default is 0 |
| index_col | Column numbers or names to use as the row index |
| names | List of column names for the result |
| skiprows | Number of rows at the beginning of the file to ignore, or a list of row numbers to skip |
| na_values | Sequence of values to replace with NA |
| keep_default_na | Whether to use the default NA value list or not |
| comment | Character(s) used to split comments off the end of lines |
| nrows | Number of rows to read from the beginning of the file |



In [30]:
# Read the CSV in pieces of at most 1000 rows. With `chunksize`, read_csv
# returns an iterator that yields one DataFrame per chunk instead of a single
# DataFrame. Using that iterator as a context manager (pandas >= 1.2) closes
# the underlying file as soon as the block is left, even if a chunk raises.
with pd.read_csv("./datasets/ex1.csv", chunksize=1000) as chunker:
    # Empty integer Series; serves as the accumulator for the aggregation
    # variant that is commented out below.
    tot = pd.Series([], dtype="int64")

    # Aggregation variant: sum the value counts of column "a" over all chunks
    # for piece in chunker:
    #     tot = tot.add(piece["a"].value_counts(), fill_value=0)

    # Show every chunk as it is read
    for p in chunker:
        print(p)

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


### Writing data to text format

In [35]:
# Build a 4x5 frame holding the integers 0..19, with rows labelled "a".."d"
# and columns labelled "u".."y".
df = pd.DataFrame(
    data=np.arange(20).reshape(4, 5),
    columns=["u", "v", "w", "x", "y"],
    index=["a", "b", "c", "d"],
)

# Write the frame as CSV. `na_rep` is the text written in place of missing
# values; this frame contains none, so it does not show up in the output.
df.to_csv(path_or_buf="./datasets/out.csv", na_rep="ehhhh")
