In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')

In [3]:
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": ["CoreyMSchafer@gmail.com", "JaneDoe@gmail.com", "JohnDoe@gmail.com"]
}

In [4]:
df_test = pd.DataFrame(people)

In [5]:
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


## Updating Columns

In [6]:
df_test.columns

Index(['first', 'last', 'email'], dtype='object')

In [7]:
df_test.columns = ["first_name", "last_name", "email"]
df_test

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [8]:
df_test.columns = [x.upper() for x in df_test.columns]
df_test

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [9]:
df_test.columns = [x.replace("_", " ") for x in df_test.columns]
df_test

Unnamed: 0,FIRST NAME,LAST NAME,EMAIL
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [10]:
type(df_test.columns)

pandas.core.indexes.base.Index

In [11]:
df_test.columns = [x.lower() for x in df_test.columns]
df_test

Unnamed: 0,first name,last name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [12]:
df_test.columns = ["first_name", "last_name", "email"]
df_test

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [13]:
df_test.rename(columns = {"first_name": "first", "last_name": "last"})

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [14]:
df_test

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [15]:
df_test.rename( columns = {"first_name": "first", "last_name": "last"}, inplace = True)
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


## Updating Rows

In [16]:
df_test.loc[2]

first                 John
last                   Doe
email    JohnDoe@gmail.com
Name: 2, dtype: object

In [17]:
df_test.loc[2] = ["John", "Smith", "JohnSmith@gmail.com"]
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JohnSmith@gmail.com


In [18]:
df_test.loc[2, ["last", "email"]] = ["Doe", "JohnDoe@gmail.com"]
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [19]:
filt = (df_test.loc[:, "email"] == "JohnDoe@gmail.com")
df_test.loc[filt]

Unnamed: 0,first,last,email
2,John,Doe,JohnDoe@gmail.com


In [20]:
df_test.loc[filt]["last"] = "Smith"
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[filt]["last"] = "Smith"


Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [21]:
df_test.loc[filt, "last"]

2    Doe
Name: last, dtype: object

In [22]:
df_test.loc[filt, "last"] = "Smith"
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JohnDoe@gmail.com


In [23]:
df_test["email"] = [x.lower() for x in df_test["email"]]
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Smith,johndoe@gmail.com


#### Methods For Updating Rows

4 methods:
1. apply()
2. map()
3. applymap()
4. replace()

#### 1. apply()

-> using apply() on a series we can apply a specific or custom function to every value in our series

In [24]:
# Let's find out the length of every email in the email address series using the inbuilt len() function 

df_test["email"].apply(len) # we are applying the apply() function on the series df_test["email"]

0    23
1    17
2    17
Name: email, dtype: int64

In [25]:
def update_email(email):
    return email.upper()

df_test["email"].apply(update_email)

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@GMAIL.COM
2          JOHNDOE@GMAIL.COM
Name: email, dtype: object

In [26]:
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Smith,johndoe@gmail.com


-> <b>Now</b> using apply on the whole dataframe as opposed to only rows

In [27]:
df_test.apply(len)

first    3
last     3
email    3
dtype: int64

<b>explanation:</b> Now what happened is it applied the len() function to every series in the dataframe(2d array) i.e. every 1d array,
when we applied len() to a series(1d array) it applied the len() to every element i.e. 0d array,

so when we apply the len() to the whole dataframe we apply the value to every series in the dataframe i.e. every column in the dataframe. 'first' column has a length of three i.e. corey, jane, john and 'last' column has a length of three i.e. schafer, doe, doe etc.

In [28]:
# it is similar to
print(len(df_test["first"]))
print(len(df_test["last"]))
print(len(df_test["email"]))

3
3
3


In [29]:
df_test.apply(len, axis=1)

0    3
1    3
2    3
dtype: int64

## IMP Note on Axes:
-> <i><b>axis=0</b></i> is <i><b>vertically downward</b></i> called 'rows'----since we are going through rows<br>
-> <i><b>axis=1</b></i> is <i><b>horizontally forward</b></i> called 'columns'----since we are going through columns<br>

By default the axis is 0 or "rows" or vertically downward

In [30]:
print(df_test.apply(len))
print(df_test.apply(len, axis=0))

first    3
last     3
email    3
dtype: int64
first    3
last     3
email    3
dtype: int64


Similarly we can apply numpy functions to the dataframe, all the elements in the dataframe

#### 2. applymap()

-> applymap() is used on the entire dataframe to every single individual element, we can apply a specific or custom function to every value in our dataframe

In [31]:
df_test.applymap(len)

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [32]:
df_test.applymap(str.lower) # since every element in our dataframe is in lower case we can do str.lower() 
# on the entire dataframe, but if our dataframe would've contained any numeric values it would give an error.

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@gmail.com
2,john,smith,johndoe@gmail.com


#### 3. map()

-> map() is used on a series we can replace the value with a provided value but all the other not-provided values are turned to 'NaN' values

In [33]:
df_test["first"].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

#### 4. replace()

-> replace() is used on a series we can replace the value with a provided value.

In [34]:
df_test["first"].replace({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2     John
Name: first, dtype: object

In [35]:
df_test

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Smith,johndoe@gmail.com


### Add or Remove rows and columns

#### Adding Columns

In [36]:
df_test['first'] + ' ' + df_test['last']

0    Corey Schafer
1         Jane Doe
2       John Smith
dtype: object

In [37]:
df_test['full_name'] = df_test['first'] + ' ' + df_test['last'] # added a new column
df_test

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,coreymschafer@gmail.com,Corey Schafer
1,Jane,Doe,janedoe@gmail.com,Jane Doe
2,John,Smith,johndoe@gmail.com,John Smith


#### Removing Columns

In [38]:
df_test.drop(columns=['first', 'email'])

Unnamed: 0,last,full_name
0,Schafer,Corey Schafer
1,Doe,Jane Doe
2,Smith,John Smith


In [39]:
df_test

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,coreymschafer@gmail.com,Corey Schafer
1,Jane,Doe,janedoe@gmail.com,Jane Doe
2,John,Smith,johndoe@gmail.com,John Smith


In [40]:
df_test.drop(columns=['first', 'last'], inplace=True)
df_test

Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Corey Schafer
1,janedoe@gmail.com,Jane Doe
2,johndoe@gmail.com,John Smith


In [41]:
df_test['full_name'].str.split(" ")

0    [Corey, Schafer]
1         [Jane, Doe]
2       [John, Smith]
Name: full_name, dtype: object

In [42]:
df_test['full_name'].str.split(" ", expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Smith


In [43]:
df_test[['first', 'last']] = df_test['full_name'].str.split(" ", expand=True) # adding multiple columns to df_test
df_test

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith


#### Adding rows

to df_test.append(arg1, arg_2_to_n)------> arg1 can be a dict object or a dataframe object.

In [44]:
df_test.append({'first': 'Tony'}, ignore_index=True)

  df_test.append({'first': 'Tony'}, ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,,,Tony,


In [45]:
people = {
    'first': ['Tony', 'Steve'], 
    'last': ['Stark', 'Rogers'], 
    'email': ['IronMan@avenge.com', 'Cap@avenge.com']
}
df_test2 = pd.DataFrame(people)

In [46]:
df_test2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [47]:
df_test.append(df_test2, ignore_index=True)

  df_test.append(df_test2, ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [48]:
df_test

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith


In [49]:
df_test = df_test.append(df_test2, ignore_index=True)
df_test

  df_test = df_test.append(df_test2, ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


#### Removing rows

we use drop()-> same as removing columns except we provide an index=num for the index we want to drop

In [50]:
df_test.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark


In [51]:
df_test

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [52]:
df_test.drop(index=df_test[ df_test['last'] == 'Doe' ].index)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [53]:
df_test

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [54]:
filt = (df_test["email"] == "johndoe@gmail.com")
df_test[filt]

Unnamed: 0,email,full_name,first,last
2,johndoe@gmail.com,John Smith,John,Smith


In [60]:
df_test.loc[df_test[filt].index, ["last","full_name"]] = ["Doe", "John Doe"]

In [61]:
df_test

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@gmail.com,Jane Doe,Jane,Doe
2,johndoe@gmail.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [62]:
filt = (df_test['last'] == "Doe")
df_test.drop(df_test[filt].index)

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers
