In [2]:
import pandas as pd

In [3]:
# Read the weather observations from the CSV file in the working directory.
df = pd.read_csv("weather_data.csv")
# A bare last expression makes the notebook render the frame.
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [4]:
# Days on which the event was "Sunny": filter the rows with a boolean mask and
# pick the "day" column in a single .loc call rather than chaining two [] lookups.
df.loc[df["event"] == "Sunny", "day"]

1    1/2/2017
5    1/6/2017
Name: day, dtype: object

In [5]:
# A DataFrame can also be built directly from a Python dictionary:
# each key becomes a column name and each list holds that column's values.
weather_data = {
    "day": [
        "1/1/2017", "1/2/2017", "1/3/2017",
        "1/4/2017", "1/5/2017", "1/6/2017",
    ],
    "temperature": [32, 35, 28, 24, 32, 31],
    "windspeed": [6, 7, 2, 7, 4, 2],
    "event": ["Rain", "Sunny", "Snow", "Snow", "Rain", "Sunny"],
}

df = pd.DataFrame(data=weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [6]:
# (number of rows, number of columns). The notebook displays a bare last
# expression on its own, so wrapping it in print() is unnecessary.
df.shape

(6, 4)


In [7]:
# Keep the frame's dimensions as two separate counts.
rows = len(df.index)
columns = len(df.columns)
(rows, columns)

(6, 4)

In [8]:
# head() returns the first rows of a data set.
# Called with no argument, df.head() returns the first 5 rows.
# The number of leading rows can also be set explicitly:
df.head(3)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow


In [9]:
# tail() returns the last rows of a data set.
# Called with no argument, df.tail() returns the last 5 rows.
# The number of trailing rows can also be set explicitly:
df.tail(3)

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [10]:
# DataFrames can be sliced by row position much like a list:
# the start position is included and the stop position is excluded.
df.iloc[2:4]

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow


In [11]:
# Column labels of the frame, held in a pandas Index.
df.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [12]:
# Select a single column by its label.
df["event"] # attribute access (df.event) selects the same column

0     Rain
1    Sunny
2     Snow
3     Snow
4     Rain
5    Sunny
Name: event, dtype: object

In [13]:
# In data analysis, often only a few of the many available fields are needed.
# Passing a list of column names selects just those columns:
df[["day", "event"]]

Unnamed: 0,day,event
0,1/1/2017,Rain
1,1/2/2017,Sunny
2,1/3/2017,Snow
3,1/4/2017,Snow
4,1/5/2017,Rain
5,1/6/2017,Sunny


In [14]:
# Show the type of two individual columns and of the column-label container,
# one per line.
print(type(df.event), type(df.temperature), type(df.columns), sep="\n")

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.base.Index'>


In [15]:
# describe() comes in handy for quick basic statistics: it reports count, mean,
# std, min, quartiles and max for the numeric columns.
df.describe()

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


In [16]:
# Row(s) whose temperature equals the highest temperature in the frame.
df.loc[df["temperature"].eq(df["temperature"].max())]

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,35,7,Sunny


In [17]:
# Day, event and temperature for the row(s) with the highest temperature.
# A single .loc call filters the rows and picks the columns together, avoiding
# the intermediate frame that chained [] lookups create.
df.loc[df["temperature"] == df["temperature"].max(), ["day", "event", "temperature"]]

Unnamed: 0,day,event,temperature
1,1/2/2017,Sunny,35


In [18]:
# Show the whole frame again for reference.
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [19]:
# Row labels of the frame (a default RangeIndex at this point).
df.index

RangeIndex(start=0, stop=6, step=1)

In [20]:
# Use the "day" column as the row index. The result is assigned back to df
# explicitly instead of mutating the frame through inplace=True.
df = df.set_index("day")

In [21]:
# "day" is now the row index rather than a regular column.
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Rain
1/6/2017,31,2,Sunny


In [22]:
# Label-based lookup: fetch the row whose index label is "1/2/2017".
df.loc["1/2/2017"]

temperature       35
windspeed          7
event          Sunny
Name: 1/2/2017, dtype: object

In [23]:
# Turn the current index back into a regular column and restore the default
# integer index, assigning the result instead of mutating with inplace=True.
df = df.reset_index()

In [24]:
# Index the rows by "event" instead. Index labels may repeat (each event value
# occurs twice in this data). The result is assigned back to df explicitly
# rather than mutating the frame through inplace=True.
df = df.set_index("event")
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Rain,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [25]:
# The "event" index is not unique, so .loc returns every row labelled "Sunny".
df.loc["Sunny"]

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sunny,1/2/2017,35,7
Sunny,1/6/2017,31,2
