# Pandas

In [2]:
# Install pandas
!pip install pandas



In [4]:
# import pandas
import pandas as pd

### Series

A one-dimensional array in which every value is paired with an index label

In [9]:
# Series built from a plain Python list
sample_data = [10, 10, 15, 26, 45, 27]

# No index is supplied, so the values are labelled 0..5 (see the output below)
sample_series = pd.Series(data=sample_data)
sample_series

0    10
1    10
2    15
3    26
4    45
5    27
dtype: int64

In [19]:
# Series with custom index labels and a name

Songs = ['Hallelujah', 'Breathe', 'Awa Gbe O ga']
Artists = ['Nath', 'Dunsin', 'Tope']

# Each song is labelled by its artist; the Series name becomes the column
# header once the Series is converted to a one-column DataFrame
sample_ser = pd.Series(data=Songs, index=Artists, name='Gospel Artists')
to_dataFrame = sample_ser.to_frame()
print(to_dataFrame)

       Gospel Artists
Nath       Hallelujah
Dunsin        Breathe
Tope     Awa Gbe O ga


### DataFrame

A two-dimensional labelled data structure, similar to an Excel sheet or a SQL table

In [21]:
# DataFrame from a dictionary: each key becomes a column label and
# each list supplies that column's values, one per row

dict_data = {
    'Name': ['John', 'Rhema', 'Robert'],
    'Job Title': ['Data Analyst', 'DevOps Engineer', 'Scrum Master'],
    'Salary': [120000, 150000, 180000],
}
df_sample = pd.DataFrame.from_dict(dict_data)
df_sample

Unnamed: 0,Name,Job Title,Salary
0,John,Data Analyst,120000
1,Rhema,DevOps Engineer,150000
2,Robert,Scrum Master,180000


In [25]:
# DataFrame from a list of tuples: every tuple is one row,
# and the column labels are passed separately

tuple_data = [
    ('Arsenal', 7, 19),
    ('Chelsea', 7, 18),
    ('Manchester United', 8, 15),
    ('Burnley', 8, 12),
]
df_sample = pd.DataFrame(
    data=tuple_data,
    columns=['Club Name', 'Games Played', 'Points'],
)
df_sample

Unnamed: 0,Club Name,Games Played,Points
0,Arsenal,7,19
1,Chelsea,7,18
2,Manchester United,8,15
3,Burnley,8,12


In [29]:
# DataFrame from separate lists: the lists are combined into a
# dictionary so that each one becomes a named column

duration = ['6 Months', '9 Months', '12 Months', '3 Months']
prices = [150000, 200000, 250000, 100000]
courses = ['Data Science', 'Cyber Security', 'Software Engineering', 'UI/UX']

list_data = pd.DataFrame(
    data={
        'Courses': courses,
        'Course Price': prices,
        'Course Duration': duration,
    }
)
list_data

Unnamed: 0,Courses,Course Price,Course Duration
0,Data Science,150000,6 Months
1,Cyber Security,200000,9 Months
2,Software Engineering,250000,12 Months
3,UI/UX,100000,3 Months


#### Basic Operations on DataFrames and Series

In [32]:
# head() returns the first 5 rows by default; pass n to ask for a different number
list_data.head(n=2)

Unnamed: 0,Courses,Course Price,Course Duration
0,Data Science,150000,6 Months
1,Cyber Security,200000,9 Months


In [36]:
# tail() returns the last 5 rows by default; pass n to ask for a different number
list_data.tail(n=2)

Unnamed: 0,Courses,Course Price,Course Duration
2,Software Engineering,250000,12 Months
3,UI/UX,100000,3 Months


In [38]:
# Select a single column by its label; the result is a Series
list_data.loc[:, 'Courses']

0            Data Science
1          Cyber Security
2    Software Engineering
3                   UI/UX
Name: Courses, dtype: object

In [42]:
# Add a boolean column that is True for courses priced below 200000
list_data['Cheapest'] = list_data['Course Price'].lt(200000)
list_data

Unnamed: 0,Courses,Course Price,Course Duration,Cheapest
0,Data Science,150000,6 Months,True
1,Cyber Security,200000,9 Months,False
2,Software Engineering,250000,12 Months,False
3,UI/UX,100000,3 Months,True


In [50]:
# Filter with a boolean mask: keep only the courses priced at 200000 or more
Expensive = list_data.loc[list_data['Course Price'].ge(200000)]
Expensive

Unnamed: 0,Courses,Course Price,Course Duration,Cheapest
1,Cyber Security,200000,9 Months,False
2,Software Engineering,250000,12 Months,False


### Data Importing and Exporting

#### Importing Data

In [59]:
# Importing data through a relative path (a 'datasets' folder next to the notebook)
# pandas offers a family of read_* functions, one per format: read_csv for .csv/.tsv,
# read_json for .json, read_excel for .xlsx, read_sql for databases, and more

locations_data = pd.read_csv(filepath_or_buffer='datasets/locations.csv')
locations_data.head(n=5)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
0,Afghanistan,Asia,38928341.0,64.83,0.5,1803.987
1,Albania,Europe,2877800.0,78.57,2.89,11803.431
2,Algeria,Africa,43851043.0,76.88,1.9,13913.839
3,Andorra,Europe,77265.0,83.73,,
4,Angola,Africa,32866268.0,61.15,,5819.495


In [63]:
# Importing directly online using a url: read_csv is given the address instead of a local path
url_data = "https://raw.githubusercontent.com/Oyeniran20/Machine-Learning/main/6.%20Trees/housing.csv"

housing_data = pd.read_csv(url_data)
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [77]:
# Importing a .json extension file

# This dataset is not clean
phones_data = pd.read_json('datasets/phones.json')

# Preview only the first rows instead of rendering the whole frame
phones_data.head()

Unnamed: 0,phone_brand,phone_model,price,specs,pricing
0,itel,itel Smart Watch 1,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
1,oukitel,Oukitel WP19,About 380 EUR,{'Network': {'2G bands': 'GSM 850 / 900 / 1800...,
2,cubot,Cubot Smart Watch,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
3,cubot,Cubot ID206,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
4,tcl,TCL Plex,About 330 EUR,{'Network': {'2G bands': 'GSM 850 / 900 / 1800...,


In [75]:
# This is the clean data in .csv extension
clean_phone_data = pd.read_csv('datasets/phones_processed.csv')

# Preview only the first rows instead of rendering the whole frame
clean_phone_data.head()

Unnamed: 0,phone_brand,phone_model,store,price_usd,storage,ram,launch_date,dimensions,weight,display_type,...,price_range,os_type,os_version,battery_size,colors_available,chip_company,cpu_core,gpu_company,fingerprint,video_resolution
0,apple,Apple iPhone 16 Pro,Amazon DE,1357.55,256,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,medium price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
1,apple,Apple iPhone 16 Pro,Amazon DE,1492.55,512,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
2,apple,Apple iPhone 16 Pro,Amazon DE,1705.32,1000,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
3,apple,Apple iPhone 16 Pro Max,Amazon DE,1564.92,512,8,2024-09-20,163 x 77.6 x 8.3 mm (6.42 x 3.06 x 0.33 in),227.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Large,4,Apple,Hexa-core,Apple,Face,4K
4,apple,Apple iPhone 12 mini,Amazon DE,247.32,128,4,2020-11-13,131.5 x 64.2 x 7.4 mm (5.18 x 2.53 x 0.29 in),135.0,"Super Retina XDR OLED, HDR10, Dolby Vision, 62...",...,medium price,iOS,14.1,Small,6,Apple,Hexa-core,Apple,Face,4K


In [83]:
# Preparing to import a .sqlite extension file:
# sqlite3 is the module used by the following cells to open the database
import sqlite3
# Print the version of the SQLite library the sqlite3 module is running against
print(sqlite3.sqlite_version)

3.45.3


In [105]:
# Connect to the database
db_conn = sqlite3.connect('datasets/wildfires.sqlite')

try:
    # Query to list only tables (sqlite_master also records indexes, views and triggers)
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)
finally:
    # Release the connection even if the query raises
    db_conn.close()

# Display the table names
print("Tables in the database:")
print(tables)

Tables in the database:
                                  name
0                      spatial_ref_sys
1                   spatialite_history
2                      sqlite_sequence
3                     geometry_columns
4                  spatial_ref_sys_aux
5               views_geometry_columns
6               virts_geometry_columns
7          geometry_columns_statistics
8    views_geometry_columns_statistics
9    virts_geometry_columns_statistics
10        geometry_columns_field_infos
11  views_geometry_columns_field_infos
12  virts_geometry_columns_field_infos
13               geometry_columns_time
14               geometry_columns_auth
15         views_geometry_columns_auth
16         virts_geometry_columns_auth
17                  sql_statements_log
18                        SpatialIndex
19                ElementaryGeometries
20                                 KNN
21                               Fires
22                     idx_Fires_Shape
23                idx_Fires_Shape_node
2

In [111]:
# Path to the SpatiaLite library
# NOTE: this is an absolute, version-specific path; change it to wherever
# mod_spatialite is installed on the machine running the notebook
spatialite_lib = "/usr/local/Cellar/libspatialite/5.1.0_1/lib/mod_spatialite.dylib"

# Connect to the SQLite database
db_conn = sqlite3.connect('datasets/wildfires.sqlite')

try:
    # Enable loading of extensions only for as long as it takes to load SpatiaLite
    db_conn.enable_load_extension(True)
    try:
        # load_extension() takes the path as an argument, so it never has to be
        # spliced into an SQL string
        db_conn.load_extension(spatialite_lib)
        print("SpatiaLite extension loaded successfully.")
    except sqlite3.Error as e:
        # Best effort: tables that do not need SpatiaLite can still be read
        print(f"Failed to load SpatiaLite extension: {e}")
    finally:
        db_conn.enable_load_extension(False)

    # Get the list of tables
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)

    # Loop through each table and display its content
    for table_name in tables['name']:
        print(f"\n--- Data from table: {table_name} ---")
        try:
            # Quote the identifier so names containing spaces, keywords or quotes still parse
            quoted_name = '"' + table_name.replace('"', '""') + '"'

            # Fetch first 5 rows from the table
            table_data = pd.read_sql(f"SELECT * FROM {quoted_name} LIMIT 5;", db_conn)
            print(table_data)
        except Exception as e:
            # Report the failure and carry on with the remaining tables
            print(f"Could not fetch data from table {table_name}: {e}")
finally:
    # Release the connection even if one of the steps above raises
    db_conn.close()

SpatiaLite extension loaded successfully.

--- Data from table: spatial_ref_sys ---
   srid auth_name  auth_srid                              ref_sys_name  \
0    -1      NONE         -1                     Undefined - Cartesian   
1     0      NONE          0           Undefined - Geographic Long/Lat   
2  2000      epsg       2000  Anguilla 1957 / British West Indies Grid   
3  2001      epsg       2001   Antigua 1943 / British West Indies Grid   
4  2002      epsg       2002  Dominica 1945 / British West Indies Grid   

                                           proj4text  \
0                                                      
1                                                      
2  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   
3  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   
4  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   

                                              srtext  
0                                          Undefined  
1                       

In [113]:
# Importing a .tsv extension file: read_csv handles it once the separator is set to a tab
# NOTE: the file appears to have no header row, so the first record ends up
# as the column labels in the preview below

movies_data = pd.read_csv('datasets/movie_titles_metadata.tsv', sep='\t')
movies_data.head()

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...


In [119]:
# Naming the columns: passing names= supplies the labels, so the first line
# of the file is read as data instead of being used as the header
col_name = ['sn', 'name', 'release_year', 'ratings', 'voting', 'genre']
movies_data = pd.read_csv('datasets/movie_titles_metadata.tsv', sep='\t', names=col_name)

# Preview only the last rows instead of rendering the whole frame
movies_data.tail()

Unnamed: 0,sn,name,release_year,ratings,voting,genre
612,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
613,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
614,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
615,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']
616,m616,zulu dawn,1979,6.4,1911.0,['action' 'adventure' 'drama' 'history' 'war']


#### Functions and attributes

In [128]:
# Determine the shape of a dataset: a (number of rows, number of columns) tuple
movies_data.shape

(617, 6)

In [133]:
# Get a summary of the dataset (DataFrame): index range, each column's
# non-null count and dtype, and the memory usage
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sn            617 non-null    object 
 1   name          616 non-null    object 
 2   release_year  616 non-null    object 
 3   ratings       616 non-null    float64
 4   voting        616 non-null    float64
 5   genre         616 non-null    object 
dtypes: float64(2), object(4)
memory usage: 29.1+ KB


#### Exporting Data

In [137]:
# Converting the data loaded from the .json file to a .csv file
# index=False keeps the automatic row index from being written as an extra
# unnamed first column, matching the Excel export further down

phones_data.to_csv('phones_convert.csv', index=False)

In [141]:
# Exporting .sqlite extension file to xlsx: one workbook per table,
# written to the current working directory

# Path to mod_spatialite.dylib
# NOTE: this is an absolute, version-specific path; change it to wherever
# mod_spatialite is installed on the machine running the notebook
spatialite_lib = '/usr/local/Cellar/libspatialite/5.1.0_1/lib/mod_spatialite.dylib'

# Connect to the SQLite database
db_conn = sqlite3.connect('datasets/wildfires.sqlite')

try:
    # Load the SpatiaLite extension, then switch extension loading off again
    db_conn.enable_load_extension(True)
    try:
        db_conn.load_extension(spatialite_lib)
    finally:
        db_conn.enable_load_extension(False)

    # Get the list of tables
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)

    # Loop through each table and export to Excel
    for table_name in tables['name']:
        print(f"Exporting data from table: {table_name}...")
        try:
            # Quote the identifier so names containing spaces, keywords or quotes still parse
            quoted_name = '"' + table_name.replace('"', '""') + '"'

            # Fetch all data from the table
            table_data = pd.read_sql(f"SELECT * FROM {quoted_name};", db_conn)

            # Define Excel filename (you can customize this)
            excel_filename = f"{table_name}.xlsx"

            # Export to Excel without the automatic row index
            table_data.to_excel(excel_filename, index=False)
            print(f"Data exported to {excel_filename}")
        except Exception as e:
            # Covers both read and write failures; carry on with the remaining tables
            print(f"Could not fetch data from table {table_name}: {e}")
finally:
    # Close the database connection even if loading the extension or a query raises
    db_conn.close()

Exporting data from table: spatial_ref_sys...
Data exported to spatial_ref_sys.xlsx
Exporting data from table: spatialite_history...
Data exported to spatialite_history.xlsx
Exporting data from table: sqlite_sequence...
Data exported to sqlite_sequence.xlsx
Exporting data from table: geometry_columns...
Data exported to geometry_columns.xlsx
Exporting data from table: spatial_ref_sys_aux...
Data exported to spatial_ref_sys_aux.xlsx
Exporting data from table: views_geometry_columns...
Data exported to views_geometry_columns.xlsx
Exporting data from table: virts_geometry_columns...
Data exported to virts_geometry_columns.xlsx
Exporting data from table: geometry_columns_statistics...
Data exported to geometry_columns_statistics.xlsx
Exporting data from table: views_geometry_columns_statistics...
Data exported to views_geometry_columns_statistics.xlsx
Exporting data from table: virts_geometry_columns_statistics...
Data exported to virts_geometry_columns_statistics.xlsx
Exporting data from 