In [1]:
import pandas as pd

# Reading nicely formateed csv file
dataf = pd.read_csv("files/stock_data-perfect.csv")
dataf

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [2]:
# Reading csv file with extra header
# Reading csv file with extra headers can produce bogus DataFrame format
# To prevent this from happening, we choose a header

# By attribute header=1, we are choosing the second no. of header. (Count of headers starts from zero)
dataf = pd.read_csv("files/stock_data-extra-header.csv", header=1)
dataf


Unnamed: 0,tickers,esp,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [3]:
# Reading csv file with no headers
# Pandas sets the headers as 0, 1, 3, etc when no headers are setted in csv file
# In order to set our own meaningul headers, we use 'name' attribute and pass an array of strings


# 'nrows' attribute is an attribute that sets the number of rows to be outputed
# This attribte comes in handy when there's a humongus dataset
dataf = pd.read_csv("files/stock_data-no-header.csv", names=["tickers", "esp", "revenue", "price", "people"], nrows=5)
dataf

Unnamed: 0,tickers,esp,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [4]:
# We can see that our obtained dataframe cotains some false values
# that can produce bogus results when doing statistical analysis
# In order to prevent this, we replace these falsy values by a value
# of which we will take special care of when doing analysis

# We use 'na_values' attribute to support this feature
dataf = pd.read_csv("files/stock_data-perfect.csv", na_values={
    "eps": ["not available", "n.a."],
    "revenue": ["not available", "n.a.", -1],
    "people": ["not available", "n.a."]
})

dataf

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87.0,845,larry page
1,WMT,4.61,484.0,65,
2,MSFT,-1.0,85.0,64,bill gates
3,RIL,,50.0,1023,mukesh ambani
4,TATA,5.6,,n.a.,ratan tata


In [5]:
# As observed above, our dataframe is now clean
# To save these changes into another csv file, we use this method

# By default the indexes are also being stored in the csv file which
# is absolutely not what we want to happen
dataf.to_csv("files/new_stock.csv")

# In order to stop indexes from saving, we use
dataf.to_csv("files/new_stock.csv", index=False)

In [6]:
# Sometimes our requirement is just to save certain columns of datafram to be
# saved in csv, we use 'columns' attribute to support this feature
dataf.to_csv("files/new_stock.csv", index=False, columns=["tickers", "eps"])

In [7]:
# We can also stop header of dataframe from saving in csv
dataf.to_csv("files/new_stock.csv", index=False, header=False)