In [1]:
import pandas as pd

In [2]:
# Build a small dictionary to see how it maps onto a DataFrame.

people = {
    "First": ["Me", "Myself", "I"],
    "Last": ["He", "She", "It"],
    "Email": ["mehe@email.com", "myselfshe@email.com", "iit@email.com"],
}


# Each key can be read as a column label and each list as that column's data,
# so the values at the same position across the lists describe one person (one row).
# pandas can present this dict as a table of rows and columns.

In [3]:
# Turn the dict into a DataFrame: keys become column labels, lists become column data.
df_example1 = pd.DataFrame(data=people)

In [4]:
df_example1

Unnamed: 0,First,Last,Email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,I,It,iit@email.com


In [5]:
# Inspect the column labels of the DataFrame.
df_example1.columns

Index(['First', 'Last', 'Email'], dtype='object')

In [6]:
# Rename every column at once by assigning a new list of labels to `.columns`.
# The DataFrame itself is modified; the list must contain one label per existing column.
df_example1.columns = ["first_name", "last_name", "email"]

In [7]:
df_example1


Unnamed: 0,first_name,last_name,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,I,It,iit@email.com


In [8]:
# Convert every column label to upper case.
df_example1.columns = df_example1.columns.str.upper()

In [9]:
df_example1

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,I,It,iit@email.com


In [10]:
# Replace spaces with underscores in the column names.
# (The current names contain no spaces, so this leaves them unchanged.)
df_example1.columns = df_example1.columns.str.replace(' ', '_')

In [11]:
# Put the column labels back into lower case.
df_example1.columns = df_example1.columns.str.lower()

In [12]:
df_example1

Unnamed: 0,first_name,last_name,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,I,It,iit@email.com


In [13]:
# Rename only selected columns by passing an {old_name: new_name} dictionary.
# Columns that are not listed in the dictionary (here 'email') keep their names.
# The result is assigned back instead of using `inplace=True`, which keeps the
# cell chainable and makes the data flow explicit.
df_example1 = df_example1.rename(columns={'first_name': 'first', 'last_name': 'last'})

In [14]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,I,It,iit@email.com


In [15]:
# Updating data in rows.
# Replace every value of one row by assigning a full list to `.loc[row_label]`.
df_example1.loc[2] = ['John', 'Smith', 'JohnSmith@email.com'] # one new value per column, in column order.

In [16]:
df_example1


Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Smith,JohnSmith@email.com


In [17]:
# Update only some columns of a row: give `.loc` the row label and a list of
# column labels, then assign the new values in the same order.
df_example1.loc[2, ["last", "email"]] = ["Doe", "JohnDoe@email.com"]

In [18]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Doe,JohnDoe@email.com


In [19]:
# Change just the last name of row 2.
df_example1.loc[2, "last"] = "Smith"

In [20]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Smith,JohnDoe@email.com


In [21]:
# To change a single value we can use `.at` instead of `.loc`.
df_example1.at[2, 'last'] = 'Doe'

In [22]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Doe,JohnDoe@email.com


In [23]:
# A common mistake is to assign through chained indexing instead of an indexer such as `.loc`.
# Suppose we have a large DataFrame and want to find John Doe to change his last name or email.
# First build a boolean filter that selects his row.
filt = (df_example1['email'] == 'JohnDoe@email.com')

# Chained indexing (kept commented out on purpose):

#df_example1[filt]['last'] = 'Smith' # Running this gives a SettingWithCopyWarning and does not actually change our DataFrame.
# The working form passes the filter and the column label to `.loc` in one step:
#df_example1.loc[filt, 'last'] = 'Smith'

In [24]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Doe,JohnDoe@email.com


In [25]:
# Updating many rows at once: for example, convert every email address to lower case.
# Assign the column to the lower-cased version of itself.
df_example1['email'] = df_example1['email'].str.lower()

In [26]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Doe,johndoe@email.com


### Now we will use four methods that are used quite frequently but are often confused with one another:
### apply, map, applymap, replace

In [27]:
# Start with apply(): it works on both DataFrames and Series.
# On a Series it applies the function to every value.
# Suppose we want to check the length of each email address: apply len to the column.
df_example1['email'].apply(len)
# This returns the length of each email address.

0    14
1    19
2    17
Name: email, dtype: int64

In [28]:
# apply() can be used to compute stats from the data, but it can also be used to update it.
# This example function just returns the upper-cased email, though the function
# can be as complicated as we need it to be.
# A lambda function could be used instead.
def update_email(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [29]:
df_example1['email'].apply(update_email)
# We pass the function itself rather than calling it, so there are no parentheses after its name.
# This does not change the stored data; to do that, assign the result back to the column.

0         MEHE@EMAIL.COM
1    MYSELFSHE@EMAIL.COM
2      JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [30]:
# Assign the result back to the column so the upper-cased emails are actually stored.
df_example1['email'] = df_example1['email'].apply(update_email)

In [31]:
df_example1
# Updated data.

Unnamed: 0,first,last,email
0,Me,He,MEHE@EMAIL.COM
1,Myself,She,MYSELFSHE@EMAIL.COM
2,John,Doe,JOHNDOE@EMAIL.COM


In [32]:
# Convert the emails back to lower case, this time passing a lambda function to .apply().
# The values here are strings, but the same approach works for numerical calculations.
df_example1["email"] = df_example1["email"].apply(lambda address: address.lower())

In [33]:
df_example1

Unnamed: 0,first,last,email
0,Me,He,mehe@email.com
1,Myself,She,myselfshe@email.com
2,John,Doe,johndoe@email.com


In [34]:
# Using apply() on a DataFrame; so far we have only used it on a Series.
df_example1.apply(len) 
# apply() can be run on the entire DataFrame instead of on one specific column.
# Here len receives each column as a Series, so the result is the number of values in each column.

first    3
last     3
email    3
dtype: int64

In [35]:
# apply() can also work row by row by changing the axis.
df_example1.apply(len, axis = 'columns') 


# With axis='columns' the function receives one row at a time, i.e. it moves
# across the columns (->), so len counts the values in each row:
#
# 	 first	last	email
# 0	 Me	He	mehe@email.com        -> counted left to right
# 1	 Myself	She	myselfshe@email.com
# 2	 John	Doe	johndoe@email.com
#
# The default axis is the row axis ('index'), where the function receives one column at a time.

0    3
1    3
2    3
dtype: int64

In [36]:
# To get the minimum value of each column, apply a min function to the DataFrame.
# For a column of strings, the minimum is the value that comes first in alphabetical order.
# The minimum is computed column by column (Series by Series).
df_example1.apply(pd.Series.min)

first                 John
last                   Doe
email    johndoe@email.com
dtype: object

In [37]:
# The same thing with a lambda function. On a DataFrame, apply() hands the function a
# Series (one column), so the methods available on the argument are Series methods.
df_example1.apply(lambda column: column.min())
# Same result as above.

first                 John
last                   Doe
email    johndoe@email.com
dtype: object

#### Remember: running apply() on a Series applies a function to every value in that Series,
#### and running apply() on a DataFrame applies a function to every Series in that DataFrame.

In [38]:
# To apply a function to every individual element of a DataFrame, use applymap().
# applymap() only works on DataFrames; Series objects do not have an applymap method.
# NOTE: pandas 2.1 deprecated DataFrame.applymap in favour of DataFrame.map.
df_example1.applymap(len)

Unnamed: 0,first,last,email
0,2,2,14
1,6,3,19
2,4,3,17


In [39]:
# Lower-case every element of the DataFrame; this works because all values are strings.
df_example1 = df_example1.applymap(lambda value: value.lower())

In [40]:
df_example1

Unnamed: 0,first,last,email
0,me,he,mehe@email.com
1,myself,she,myselfshe@email.com
2,john,doe,johndoe@email.com


In [41]:
# map() only works on a Series; it substitutes each value in the Series with a new value.
# For example, to substitute some first names, pass a dictionary of {old: new} values.
# Values that are not keys of the dictionary (here 'john') become NaN.
df_example1['first'].map({'me':'Chris', 'myself': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [42]:
# To keep 'john' as it is and substitute only the other names, use the replace() method.
df_example1['first'] = df_example1['first'].replace({'me':'Chris', 'myself': 'Mary'})
# Same substitutions as with map(), but 'john' is left unchanged instead of becoming NaN.

In [43]:
df_example1

Unnamed: 0,first,last,email
0,Chris,he,mehe@email.com
1,Mary,she,myselfshe@email.com
2,john,doe,johndoe@email.com


#### Now using a larger data set, i.e. the Stack Overflow 2019 Developer Survey

In [44]:
# Load the survey results (indexed by the 'Respondent' column) and the
# accompanying schema file (indexed by its 'Column' column).
df = pd.read_csv("data/survey_results_public.csv", index_col="Respondent")
schema_df = pd.read_csv("data/survey_results_schema.csv", index_col="Column")

In [45]:
# Let pandas display up to 85 columns and 85 rows before truncating the output.
pd.set_option("display.max_columns", 85)
pd.set_option("display.max_rows", 85)

In [46]:
# The filtering notebook worked with salaries, which are stored in the 'ConvertedComp' column.
# Give that column a more descriptive name.
# The result is assigned back instead of using `inplace=True`, which keeps the
# cell chainable and makes the data flow explicit.
df = df.rename(columns={'ConvertedComp': 'SalaryUSD'})

In [47]:
df['SalaryUSD'] # The renamed salary column is now available as 'SalaryUSD'.

Respondent
1            NaN
2            NaN
3         8820.0
4        61000.0
5            NaN
          ...   
88377        NaN
88601        NaN
88802        NaN
88816        NaN
88863        NaN
Name: SalaryUSD, Length: 88883, dtype: float64

In [48]:
# Now have a look at the Hobbyist column.
df['Hobbyist']

Respondent
1        Yes
2         No
3        Yes
4         No
5        Yes
        ... 
88377    Yes
88601     No
88802     No
88816     No
88863    Yes
Name: Hobbyist, Length: 88883, dtype: object

In [49]:
# Convert the values to booleans: 'Yes' -> True and 'No' -> False.
# map() does this; here we only preview the result without storing it.
df['Hobbyist'].map({'Yes':True, 'No': False})

Respondent
1         True
2        False
3         True
4        False
5         True
         ...  
88377     True
88601    False
88802    False
88816    False
88863     True
Name: Hobbyist, Length: 88883, dtype: bool

In [50]:
df['Hobbyist'] = df['Hobbyist'].map({'Yes':True, 'No': False})
# This column only contains 'Yes' and 'No', so map() covers every value.
# Other columns may hold additional values; map() would turn those into NaN,
# so the replace() method is the better choice there.
# NOTE: run this cell only once per load. Mapping the already-boolean column again
# finds no matching keys and would turn every value into NaN.

In [51]:
df['Hobbyist']

Respondent
1         True
2        False
3         True
4        False
5         True
         ...  
88377     True
88601    False
88802    False
88816    False
88863     True
Name: Hobbyist, Length: 88883, dtype: bool