Pandas - DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a small demo frame in which each column comes from a different kind of
# input; the scalar entries ("A", "B", "F") are repeated for every row of the
# array-like columns.
df_deneme = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20230101"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

In [3]:
df_deneme

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-01-01,1.0,3,test,foo
1,1.0,2023-01-01,1.0,3,train,foo
2,1.0,2023-01-01,1.0,3,test,foo
3,1.0,2023-01-01,1.0,3,train,foo


In [4]:
df_deneme.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [5]:
df_deneme.info() # to get the info of the DataFrame

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype        
---  ------  --------------  -----        
 0   A       4 non-null      float64      
 1   B       4 non-null      datetime64[s]
 2   C       4 non-null      float32      
 3   D       4 non-null      int32        
 4   E       4 non-null      category     
 5   F       4 non-null      object       
dtypes: category(1), datetime64[s](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


In [6]:
# another example - G7 countries, written as one record per country
# (the country names are attached as the index in a later cell)
df = pd.DataFrame(
    [
        (35.467, 1785387, 9984670, 0.913, 'America'),    # Canada
        (63.951, 2833687, 640679, 0.888, 'Europe'),      # France
        (80.94, 3874437, 357114, 0.916, 'Europe'),       # Germany
        (60.665, 2167744, 301336, 0.873, 'Europe'),      # Italy
        (127.061, 4602367, 377930, 0.891, 'Asia'),       # Japan
        (64.511, 2950039, 242495, 0.907, 'Europe'),      # United Kingdom
        (318.523, 17348075, 9525067, 0.915, 'America'),  # United States
    ],
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [7]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [8]:
# Replace the default integer index (0..6) with country names. The labels are
# assigned by position, so the list must follow the existing row order.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [9]:
df.columns # to get the columns names

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [10]:
df.index # to get the index names

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [11]:
df.info() # to get the info of the dataframe

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 636.0+ bytes


In [12]:
df.describe() # to get the description of the dataframe - statistics

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [13]:
df.describe().T # to get the description of the dataframe - statistics - transposed

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Population,7.0,107.3026,97.24997,35.467,62.308,64.511,104.0005,318.523
GDP,7.0,5080248.0,5494020.0,1785387.0,2500716.0,2950039.0,4238402.0,17348080.0
Surface Area,7.0,3061327.0,4576187.0,242495.0,329225.0,377930.0,5082873.0,9984670.0
HDI,7.0,0.9004286,0.01659174,0.873,0.8895,0.907,0.914,0.916


In [14]:
# Get the number of rows (and columns)
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")

Rows: 7, Columns: 5


In [15]:
# Get total elements in the DataFrame
total_elements = df.size
print(f"Total elements: {total_elements}")

Total elements: 35


In [16]:
# Get the number of rows (axis=0) or columns (axis=1)
n_rows = len(df)
n_cols = len(df.columns)
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

Number of rows: 7
Number of columns: 5


In [17]:
# .size is actually an attribute, not a function. Therefore, you shouldn't use parentheses when accessing it.

df.size # to get the size of the dataframe

35

In [18]:
df.shape # to get the shape of the dataframe

(7, 5)

In [19]:
df.dtypes # to get the data types of the columns
  

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [20]:
df.dtypes.value_counts() # to get the count of each data type in the columns

float64    2
int64      2
object     1
Name: count, dtype: int64

Indexing, slicing, and selecting data in a DataFrame


In [21]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


| Feature  | `.loc[]`       | `.iloc[]`         |
| -------- | -------------- | ----------------- |
| Based on | Labels (names) | Integer positions |
| Slicing  | End label included | End position excluded |
| Use case | Known labels   | Known positions   |


In [22]:
# Individual rows can be selected by their index label with .loc:

df.loc["Canada"] # select row by label

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [23]:
df.iloc[0] # select row by integer location

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [24]:
df["Population"] # select column by label

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [25]:
df.Population # select column by label

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [26]:
df["Population"].to_frame() # convert the series to a dataframe

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [27]:
df[["Population", "GDP"]] # select multiple columns by label

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [33]:
df.loc["France"][["Population","GDP"]] # two-step selection: .loc returns the France row as a Series, then [[...]] keeps two of its entries by label

Population     63.951
GDP           2833687
Name: France, dtype: object

In [34]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [35]:
df.loc["France":"Italy", "Population":"GDP"] # select a range of rows and a range of columns by label; label slices include both endpoints

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [39]:
# iloc works the same way as loc, but it uses integer positions instead of labels
df.iloc[0] # select row by integer location, and 0 is the first row

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [40]:
df.iloc[-1] # select last row by integer location, and -1 is the last row


Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [41]:
df.iloc[[0, 1, -1]] # select multiple rows by integer location (first, second and last row)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


In [50]:
df.iloc[1:3] # select multiple rows by integer location with a slice; the end position (3) is excluded

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [49]:
df.iloc[1:3, 0:2] # select multiple rows and columns by integer location by slicing

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437


In [48]:
df.iloc[1:3, [0, 1]] # select multiple rows and columns by integer location giving a list of columns

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437


In [54]:
# CONDITIONAL SELECTION

df[df["Population"] > 100] # select rows where the population is greater than 100


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [None]:
df[df["Population"] > 100][["Population", "GDP"]] # select rows where the population is greater than 100 and select multiple columns, by label


Unnamed: 0,Population,GDP
Japan,127.061,4602367
United States,318.523,17348075


In [58]:
df[(df["Population"] > 80) & (df["GDP"] > 1000)] # select rows where the population is greater than 80 and the GDP is greater than 1000

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [None]:
df.loc[df["Population"] > 80] # select rows where the population is greater than 80 

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [60]:
df.loc[df["Population"] > 80, ["Population", "GDP"]] # select rows where the population is greater than 80 and select multiple columns, by label

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


In [61]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [63]:
# drop values
df.drop("Italy") # drop a row by label; this returns a new DataFrame and leaves df itself unchanged

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [65]:
df.drop("HDI", axis=1) # drop a column by label (axis=1 means columns)

Unnamed: 0,Population,GDP,Surface Area,Continent
Canada,35.467,1785387,9984670,America
France,63.951,2833687,640679,Europe
Germany,80.94,3874437,357114,Europe
Italy,60.665,2167744,301336,Europe
Japan,127.061,4602367,377930,Asia
United Kingdom,64.511,2950039,242495,Europe
United States,318.523,17348075,9525067,America


In [66]:
df.drop(columns="Continent") # drop a column by label using the columns= keyword

Unnamed: 0,Population,GDP,Surface Area,HDI
Canada,35.467,1785387,9984670,0.913
France,63.951,2833687,640679,0.888
Germany,80.94,3874437,357114,0.916
Italy,60.665,2167744,301336,0.873
Japan,127.061,4602367,377930,0.891
United Kingdom,64.511,2950039,242495,0.907
United States,318.523,17348075,9525067,0.915


In [67]:
df.drop(["Italy", "United States"], axis = 0) # drop multiple rows by label using axis = 0, row wise

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe


In [68]:
df.drop(["Population", "GDP"], axis = 1) # drop multiple columns by label using axis = 1, column wise

Unnamed: 0,Surface Area,HDI,Continent
Canada,9984670,0.913,America
France,640679,0.888,Europe
Germany,357114,0.916,Europe
Italy,301336,0.873,Europe
Japan,377930,0.891,Asia
United Kingdom,242495,0.907,Europe
United States,9525067,0.915,America


In [69]:
df.drop(["Population", "GDP"], axis = "columns") # drop multiple columns by label using axis = "columns", column wise


Unnamed: 0,Surface Area,HDI,Continent
Canada,9984670,0.913,America
France,640679,0.888,Europe
Germany,357114,0.916,Europe
Italy,301336,0.873,Europe
Japan,377930,0.891,Asia
United Kingdom,242495,0.907,Europe
United States,9525067,0.915,America


In [70]:
df.drop(["France", "Italy"], axis = 0) # drop multiple rows by label using axis = 0, row wise 

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [71]:
df.drop(["France", "Italy"], axis = "index") # drop multiple rows by label using axis = "index", row wise

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [72]:
#OPERATIONS ON DATAFRAMES

In [73]:
df[["Population", "GDP"]] # select multiple columns, by label

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [74]:
df[["Population", "GDP"]] / 100 # divide all values in the selected columns by 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [77]:
op = pd.Series([-100, -1000000], index=["Population", "GDP"]) # create a series whose index labels match the column names it will be combined with

In [76]:
df[["Population", "GDP"]] 

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [79]:
df[["Population", "GDP"]]  + op # add op to every row, matching its index labels to the column names; op holds negative values, so this subtracts 100 from Population and 1,000,000 from GDP

Unnamed: 0,Population,GDP
Canada,-64.533,785387
France,-36.049,1833687
Germany,-19.06,2874437
Italy,-39.335,1167744
Japan,27.061,3602367
United Kingdom,-35.489,1950039
United States,218.523,16348075


In [80]:
# modify data frame , add a new column

df["GDP per capita"] = df["GDP"] / df["Population"] # add a new column to the dataframe


In [81]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,50339.385908
France,63.951,2833687,640679,0.888,Europe,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,36221.712406
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975
United States,318.523,17348075,9525067,0.915,America,54464.12033


In [86]:
lang = pd.Series(["English", "French", "German", "English", "Italian"], index=["United States", "France", "Germany", "Canada", "Italy"])  # create a series indexed by country name; it covers only five of the seven countries and is not in the dataframe's row order

In [87]:
df["Language"] = lang # add a new column; values are matched to rows by index label, so countries absent from lang (Japan, United Kingdom) get a missing value

In [88]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [93]:
df.loc["Japan","Language"] = "Japanese"  # modify a single value row by label in the dataframe, for specific column, Language

In [94]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [95]:
df.loc["United Kingdom", "Language"] = "English" # modify a single value row by label in the dataframe, for specific column, Language

In [96]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,English
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [97]:
# renaming columns

df.rename(columns={"HDI": "Human Development Index"}) # returns a renamed copy for display; df itself still has the "HDI" column

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,English
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [144]:
df_copy = df.copy() # create a copy of the dataframe

In [145]:
# Rename a column and keep the result by reassigning it: rename() returns a new
# DataFrame, so df_copy ends up with the same contents as with inplace=True
# while the cell stays chainable.
df_copy = df_copy.rename(columns={"HDI": "Human Development Index"})

In [146]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,English
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [147]:
# renaming index
# Map selected row labels to new names and rebind df_copy to the returned
# DataFrame instead of using inplace=True; labels not in the mapping are kept.
df_copy = df_copy.rename(index={"United States": "USA", "United Kingdom": "UK"})

In [148]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067,0.915,America,54464.12033,English


In [149]:
df_copy.rename(index=str.upper) # rename the rows in the dataframe, str.upper means that the rows are  renamed to uppercase

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
CANADA,35.467,1785387,9984670,0.913,America,50339.385908,English
FRANCE,63.951,2833687,640679,0.888,Europe,44310.284437,French
GERMANY,80.94,3874437,357114,0.916,Europe,47868.013343,German
ITALY,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
JAPAN,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067,0.915,America,54464.12033,English


In [150]:
df_copy.rename(index=lambda x: x.upper())  # rename the rows in the dataframe, lambda x: x.upper() means that the rows are renamed to uppercase

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
CANADA,35.467,1785387,9984670,0.913,America,50339.385908,English
FRANCE,63.951,2833687,640679,0.888,Europe,44310.284437,French
GERMANY,80.94,3874437,357114,0.916,Europe,47868.013343,German
ITALY,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
JAPAN,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067,0.915,America,54464.12033,English


In [151]:
df_copy.drop(columns="Continent") # drop a column in the dataframe

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,50339.385908,English
France,63.951,2833687,640679,0.888,44310.284437,French
Germany,80.94,3874437,357114,0.916,47868.013343,German
Italy,60.665,2167744,301336,0.873,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,36221.712406,Japanese
UK,64.511,2950039,242495,0.907,45729.239975,English
USA,318.523,17348075,9525067,0.915,54464.12033,English


In [152]:
df_copy.drop(index="USA") # drop a row in the dataframe

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495,0.907,Europe,45729.239975,English


In [153]:
df_copy.loc["China"] = {"Population": 1500, "GDP": 15000000, "Human Development Index": 0.9} # add a new row by label with .loc; columns not present in the dict are left as missing values

In [154]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670.0,0.913,America,50339.385908,English
France,63.951,2833687,640679.0,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114.0,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336.0,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930.0,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495.0,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067.0,0.915,America,54464.12033,English
China,1500.0,15000000,,0.9,,,


In [155]:
#pd.concat
new_row = pd.DataFrame({"Population": 86, "GDP": 1100000, "Human Development Index": 0.9}, index=["Turkey"]) # creating a new row using pd.DataFrame method
df_copy = pd.concat([df_copy, new_row]) # adding the new row to the dataframe using pd.concat method

In [156]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670.0,0.913,America,50339.385908,English
France,63.951,2833687,640679.0,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114.0,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336.0,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930.0,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495.0,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067.0,0.915,America,54464.12033,English
China,1500.0,15000000,,0.9,,,
Turkey,86.0,1100000,,0.9,,,


In [157]:
gdp_per_capita = df_copy.loc["China", "GDP"] / df_copy.loc["China", "Population"] # calculating the GDP per capita for China

gdp_per_capita

10000.0

In [158]:
df_copy.loc["China",["Continent", "GDP per capita"]] = ["Asia", gdp_per_capita] # fill in two cells of the existing China row using .loc[row, [columns]]

In [159]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670.0,0.913,America,50339.385908,English
France,63.951,2833687,640679.0,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114.0,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336.0,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930.0,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495.0,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067.0,0.915,America,54464.12033,English
China,1500.0,15000000,,0.9,Asia,10000.0,
Turkey,86.0,1100000,,0.9,,,


In [160]:
gdp_per_capita_tr = df_copy.loc["Turkey", "GDP"] / df_copy.loc["Turkey", "Population"] # calculating the GDP per capita for Turkey

In [162]:

df_copy.loc["Turkey", ["Continent", "GDP per capita"]] = ["Europe", gdp_per_capita_tr] # fill in two cells of the existing Turkey row using .loc[row, [columns]]

In [163]:
df_copy

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670.0,0.913,America,50339.385908,English
France,63.951,2833687,640679.0,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114.0,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336.0,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930.0,0.891,Asia,36221.712406,Japanese
UK,64.511,2950039,242495.0,0.907,Europe,45729.239975,English
USA,318.523,17348075,9525067.0,0.915,America,54464.12033,English
China,1500.0,15000000,,0.9,Asia,10000.0,
Turkey,86.0,1100000,,0.9,Europe,12790.697674,


In [164]:
# STATISTICAL INFORMATION


In [167]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita,Language
Canada,35.467,1785387,9984670,0.913,America,50339.385908,English
France,63.951,2833687,640679,0.888,Europe,44310.284437,French
Germany,80.94,3874437,357114,0.916,Europe,47868.013343,German
Italy,60.665,2167744,301336,0.873,Europe,35733.025633,Italian
Japan,127.061,4602367,377930,0.891,Asia,36221.712406,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,45729.239975,English
United States,318.523,17348075,9525067,0.915,America,54464.12033,English


In [168]:
df.describe() # statistical information about the dataframe

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP per capita
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [None]:
df["Population"].max() # maximum population

318.523

In [173]:
df["Population"].min() # minimum population


35.467

In [174]:
df["Population"].sum() # sum of the population

751.118

In [172]:
df["Population"].sum() /len(df["Population"]) # calculating the average population

107.30257142857144

In [175]:
df["Population"].mean()

107.30257142857144

In [176]:
df["Population"].median() # median of the population

64.511

In [180]:
df["Population"].describe()

count      7.000000
mean     107.302571
std       97.249970
min       35.467000
25%       62.308000
50%       64.511000
75%      104.000500
max      318.523000
Name: Population, dtype: float64

In [181]:
df["Population"].quantile(0.25) # 25th percentile

62.308

In [182]:
df["Population"].quantile(0.5) # 50th percentile

64.511

In [183]:
df["Population"].quantile(0.75) # 75th percentile

104.0005

In [184]:
df["Population"].quantile(0.9) # 90th percentile

203.64580000000007

In [185]:
df["Population"].quantile([0.25, 0.5, 0.75, 0.9]) # multiple percentiles

0.25     62.3080
0.50     64.5110
0.75    104.0005
0.90    203.6458
Name: Population, dtype: float64