# Reading from & Writing to CSV and Excel files

- CSV and Excel files are read and written with similar methods. Refer to the linked documentation for more information.

In [2]:
import os

import pandas as pd

## Reading from CSV file

- Complete documentation: [here](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html?highlight=read_csv#pandas-read-csv)

- **`pd.read_csv()`** method

In [3]:
# read a CSV file into a DataFrame; the file's first line supplies the column names
df = pd.read_csv('data/stocks_data.csv')
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [5]:
# names argument -> column names to use
# NOTE: this file already has a header line. With `names` alone that line is not
# replaced: it is read as the first data row (row 0 in the output below).
# Passing header=0 together with names replaces the existing header instead.
df = pd.read_csv('data/stocks_data.csv', names = ['TICKERS', 'EPS', 'REVENUE', 'PRICE', 'PEOPLE'])
df

Unnamed: 0,TICKERS,EPS,REVENUE,PRICE,PEOPLE
0,tickers,eps,revenue,price,people
1,GOOGL,27.82,87,845,larry page
2,WMT,4.61,484,65,n.a.
3,MSFT,-1,85,64,bill gates
4,RIL,not available,50,1023,mukesh ambani
5,TATA,5.6,-1,n.a.,ratan tata


In [7]:
# index_col argument -> specify the index column
# here the 'tickers' column becomes the row labels instead of the default 0, 1, 2, ... index
df = pd.read_csv('data/stocks_data.csv', index_col='tickers')
df

Unnamed: 0_level_0,eps,revenue,price,people
tickers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GOOGL,27.82,87,845,larry page
WMT,4.61,484,65,n.a.
MSFT,-1,85,64,bill gates
RIL,not available,50,1023,mukesh ambani
TATA,5.6,-1,n.a.,ratan tata


In [8]:
# usecols argument -> read only specific columns
# only 'tickers' and 'eps' are loaded; the other columns of the file are left out of df
df = pd.read_csv('data/stocks_data.csv', usecols=['tickers', 'eps'])
df

Unnamed: 0,tickers,eps
0,GOOGL,27.82
1,WMT,4.61
2,MSFT,-1
3,RIL,not available
4,TATA,5.6


In [10]:
# skiprows argument -> number of lines to skip from the start of the file
df = pd.read_csv('data/stocks_data.csv')
print(df) # original
# NOTE: the header line counts as one of the skipped lines. With skiprows=2 the header
# line and the GOOGL row are dropped, so the WMT row ends up being used as the column
# header (see the second output). To keep the header and skip only data rows, pass a
# list of line numbers instead, e.g. skiprows=[1, 2].
df = pd.read_csv('data/stocks_data.csv', skiprows=2)
df # skips the first 2 lines of the file (header line included)

  tickers            eps  revenue price         people
0   GOOGL          27.82       87   845     larry page
1     WMT           4.61      484    65           n.a.
2    MSFT             -1       85    64     bill gates
3    RIL   not available       50  1023  mukesh ambani
4    TATA            5.6       -1  n.a.     ratan tata


Unnamed: 0,WMT,4.61,484,65,n.a.
0,MSFT,-1,85,64,bill gates
1,RIL,not available,50,1023,mukesh ambani
2,TATA,5.6,-1,n.a.,ratan tata


In [11]:
# nrows argument -> specifies number of rows to read
# the header line is not counted: the result has the column names plus 3 data rows
# NOTE(review): eps is displayed as -1.0 here (vs -1 in the full read) — presumably because
# the 'not available' row is no longer read, so the column parses as numeric; confirm with df.dtypes
df = pd.read_csv('data/stocks_data.csv', nrows=3)
df # reads first 3 data rows

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1.0,85,64,bill gates


In [4]:
# parse_dates argument -> list of columns to parse as dates instead of leaving them as strings

# without parse_dates, the first 'day' value comes back as a plain str
df_1 = pd.read_csv('data/weather_data.csv')
print(type(df_1.loc[0, 'day']))

# with parse_dates, the same value is a pandas Timestamp
df_1 = pd.read_csv('data/weather_data.csv', parse_dates=['day'])
print(type(df_1.loc[0, 'day']))

df_1

<class 'str'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,35,7,Sunny
2,2017-01-03,28,2,Snow
3,2017-01-04,24,7,Snow
4,2017-01-05,32,4,Rain
5,2017-01-06,31,2,Sunny


### Working with na-values

- Real-world data often contains missing or invalid values.

In [12]:
# example of invalid data -> observe placeholder strings such as 'n.a.' (price, people)
# and 'not available' (eps) sitting among the real values
df = pd.read_csv('data/stocks_data.csv')
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


- For further computations, it's good to replace such invalid data with `numpy.nan` (NaN).

In [13]:
# na_values argument -> extra strings to treat as missing; matching cells are read as NaN (numpy.nan)
# a plain list like this one is applied to every column of the file
df = pd.read_csv('data/stocks_data.csv', na_values=['n.a.', 'not available'])
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845.0,larry page
1,WMT,4.61,484,65.0,
2,MSFT,-1.0,85,64.0,bill gates
3,RIL,,50,1023.0,mukesh ambani
4,TATA,5.6,-1,,ratan tata


In [14]:
# notice that -1 in revenue might be invalid
# however, a plain list of na_values applies to every column: adding -1 here also turns
# the eps cell holding -1 (MSFT row) into NaN -> undesirable
df = pd.read_csv('data/stocks_data.csv', na_values=['n.a.', 'not available', -1])
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87.0,845.0,larry page
1,WMT,4.61,484.0,65.0,
2,MSFT,,85.0,64.0,bill gates
3,RIL,,50.0,1023.0,mukesh ambani
4,TATA,5.6,,,ratan tata


In [15]:
# solution: give na_values as a dict that maps each column to its own invalid markers,
# so -1 is only treated as missing in 'revenue' and stays a real value in 'eps'
na_by_column = dict(
    revenue=[-1],
    eps=['not available'],
    price=['n.a.'],
    people=['n.a.'],
)
df = pd.read_csv('data/stocks_data.csv', na_values=na_by_column)
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87.0,845.0,larry page
1,WMT,4.61,484.0,65.0,
2,MSFT,-1.0,85.0,64.0,bill gates
3,RIL,,50.0,1023.0,mukesh ambani
4,TATA,5.6,,,ratan tata


### Writing to CSV file

- **`df.to_csv()`** method

- [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html)

In [17]:
# save df to a file
# to_csv() does not create missing folders (it raises OSError instead), so make sure
# the output directory exists first; exist_ok=True makes this safe to re-run
os.makedirs('data/temp', exist_ok=True)
df = pd.read_csv('data/stocks_data.csv')
df.to_csv('data/temp/new_csv_file.csv')
# open the written file and notice how it looks — no `index` argument was passed here

In [18]:
# index argument -> specify whether or not index column is to be written
# index=False leaves the row index out of the file; the same output path as before is reused
df.to_csv('data/temp/new_csv_file.csv', index=False)

In [21]:
# header argument -> list of names to write as the header line in place of df's own column names
# (one name per column, in column order)
df.to_csv('data/temp/new_csv_file.csv', index=False, header=['TICKERS', 'EPS', 'REVENUE', 'PRICE', 'PEOPLE'])

## Working with Excel files

In [23]:
# read an Excel file into a DataFrame
# NOTE(review): no sheet_name is passed, so pandas' default sheet choice applies —
# presumably the first sheet; confirm in the read_excel docs
df = pd.read_excel('data/stock_data.xlsx')
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


- `read_excel()` accepts many of the same arguments as `read_csv()`.

- [documentation for `read_excel()`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html?highlight=read_excel#pandas.read_excel)

### Writing to excel file

- [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html#pandas.DataFrame.to_excel)

In [24]:
# writing a DataFrame to a single sheet (named 'stocks') of an Excel file
# no `index` argument is passed here, unlike the to_csv(..., index=False) calls earlier
df.to_excel('data/temp/new_excel_file.xlsx', sheet_name='stocks')

In [30]:
# writing multiple sheets to a single excel file
df_stocks = pd.read_excel('data/stock_data.xlsx')
df_weather = pd.read_csv('data/weather_data.csv')

# sheet name -> DataFrame to store on that sheet (written in this order)
sheets = {
    'stock_data': df_stocks,
    'weather_data': df_weather,
}

# one ExcelWriter shared by all to_excel() calls puts every sheet in the same workbook
with pd.ExcelWriter('data/stocks_weather_data.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)