In [97]:
import polars as pl
import plotly.express as px
from datetime import datetime

Read the csv file

In [98]:
data = pl.read_csv('amazon.csv', ignore_errors=True)

Check data type of each column


In [99]:
data.head(1)

year,state,month,number,date
i64,str,str,i64,str
1998,"""Acre""","""Janeiro""",0,"""1998-01-01"""


Convert date to datetime

Display top 5 rows

In [100]:
date = data.with_columns(
    pl.col('date').str.strptime(pl.Date, fmt='%Y-%m-%d', strict=False)
)
date.head(5)

year,state,month,number,date
i64,str,str,i64,date
1998,"""Acre""","""Janeiro""",0,1998-01-01
1999,"""Acre""","""Janeiro""",0,1999-01-01
2000,"""Acre""","""Janeiro""",0,2000-01-01
2001,"""Acre""","""Janeiro""",0,2001-01-01
2002,"""Acre""","""Janeiro""",0,2002-01-01


Check the last 5 rows

In [101]:
data.tail(5)

year,state,month,number,date
i64,str,str,i64,str
2012,"""Tocantins""","""Dezembro""",128,"""2012-01-01"""
2013,"""Tocantins""","""Dezembro""",85,"""2013-01-01"""
2014,"""Tocantins""","""Dezembro""",223,"""2014-01-01"""
2015,"""Tocantins""","""Dezembro""",373,"""2015-01-01"""
2016,"""Tocantins""","""Dezembro""",119,"""2016-01-01"""


Find the shape of our dataset (number of rows and number of columns)

In [102]:
print(f'There are {data.shape[0]} rows')
print(f'There are {data.shape[1]} columns')

There are 6454 rows
There are 5 columns


Getting information about our dataset like total number rows, total number of columns, datatypes of each column and memory requirement.

In [103]:
print(data.dtypes)
print(f'Total Allocated Heap Size: {data.estimated_size()} bytes')

[Int64, Utf8, Utf8, Int64, Utf8]
Total Allocated Heap Size: 415701 bytes


Check for duplicate data and drop them

In [104]:
clean = data.unique()
print(f'Dropped {len(data) - len(clean)} duplicate rows')

Dropped 67 duplicate rows


Check null values in the dataset.

In [105]:
display(clean.null_count())

year,state,month,number,date
u32,u32,u32,u32,u32
0,0,0,789,0


Get overall statistics about the data frame

In [106]:
clean.describe()

describe,year,state,month,number,date
str,f64,str,str,f64,str
"""count""",6387.0,"""6387""","""6387""",6387.0,"""6387"""
"""null_count""",0.0,"""0""","""0""",789.0,"""0"""
"""mean""",2007.490214,,,124.304752,
"""std""",5.730372,,,200.106133,
"""min""",1998.0,"""Acre""","""Abril""",0.0,"""1998-01-01"""
"""max""",2017.0,"""Tocantins""","""Setembro""",998.0,"""2017-01-01"""
"""median""",2007.0,,,37.0,


Rename month names to english.

In [107]:
def renameMonth(month):
    match month:
        case "Junho":
            return 'June'
        case "Abril":
            return 'April'
        case "Novembro":
            return "November"
        case "Maio":
            return 'May'
        case "Mar�o":
            return 'March'
        case "Dezembro":
            return 'December'
        case "Agosto":
            return 'August'
        case "Outubro":
            return 'October'
        case 'Setembro':
            return "September"
        case "Julho":
            return 'July'
        case "Fevereiro":
            return "February"
        case "Janeiro":
            return 'January'

display(clean['month'].unique())
clean = clean.with_columns(pl.col('month').apply(lambda x: x.replace(x, renameMonth(x))))

month
str
"""Novembro"""
"""Agosto"""
"""Fevereiro"""
"""Junho"""
"""Abril"""
"""Outubro"""
"""Dezembro"""
"""Julho"""
"""Setembro"""
"""Mar�o"""


Total number of fires registered.


In [108]:
len(clean)

6387

In which month was the maximum number of forest fires reported?

a. You need to create proper stats and showing them with the help of
bar plot

In [109]:
display(px.bar(clean.groupby('month').count(), x=0, y=1))

clean.groupby('month').count().sort(by='count', descending=True)

month,count
str,u32
"""November""",539
"""June""",539
"""July""",537
"""October""",537
"""February""",535
"""January""",535
"""April""",534
"""March""",534
"""May""",533
"""August""",528


In which year was the maximum number of forest fires reported?

In [110]:
display(px.bar(clean.groupby('year').count().sort(by='count'), x=0, y=1))

clean.groupby('year').count().sort(by='count', descending=True)

year,count
i64,u32
2014,324
2000,324
2011,323
2003,323
2009,322
2012,322
2005,322
2015,322
2008,322
2010,322


Which state has the maximum number of forest fires reported?

In [111]:
display(px.bar(clean.groupby('state').count().sort(by='count'), x=0, y=1))

Find total number of fires were reported in amazonas (state of brazil)

In [112]:
amazonas = clean.filter(clean['state'] == 'Amazonas')
len(amazonas)

239

Display number of fires were reported in amazon (year-wise)

In [113]:
display(px.bar(amazonas.groupby('year').count(), x=0, y=1))
amazonas.groupby('year').count()

year,count
i64,u32
2002,12
2016,12
2003,12
2013,12
2011,12
2012,12
2014,12
1998,12
2015,12
2017,11


Display number of fires were reported in amazon (day-wise)

In [114]:
display(clean['date'].unique())
#amazonas['date'].apply(lambda x: datetime.strftime(datetime.strptime(x, '%Y-%m-%d'), '%A'))

amazonas = amazonas.with_columns(pl.col('date').apply(lambda x: datetime.strftime(datetime.strptime(x, '%Y-%m-%d'), '%A')))
display(px.bar(amazonas.groupby('date').count(), x=0, y=1))

date
str
"""2016-01-01"""
"""2006-01-01"""
"""2001-01-01"""
"""2010-01-01"""
"""2012-01-01"""
"""2008-01-01"""
"""2009-01-01"""
"""2013-01-01"""
"""2015-01-01"""
"""2005-01-01"""


Find the total number of fires reported in 2015 and visualize data based on each ‘month

In [118]:
data_2015 = clean.filter(clean['year'] == 2015)

display(px.bar(data_2015.groupby('month').count(), x=0, y=1, labels={'0' : 'Month', '1' : 'Count'}))
data_2015.groupby('month').count()

month,count
str,u32
"""September""",26
"""October""",27
"""June""",27
"""July""",27
"""February""",27
"""November""",27
"""May""",27
"""March""",27
"""April""",27
"""December""",27


Find the average number of fires reported from highest to lowest(state-wise)

In [146]:
year_num_avg = clean.groupby('year').mean()[['year', 'number']].sort('number')

px.bar(year_num_avg, x=0, y=1, labels={'0' : 'Year', '1' : 'Count'})

To find the state names where fires were reported in 'dec' month.

In [148]:
dec_month = clean.filter(clean['month'] == 'December')
dec_month['state'].unique()

state
str
"""Roraima"""
"""Piau"""
"""Alagoas"""
"""Bahia"""
"""Pernambuco"""
"""Distrito Feder…"
"""Sao Paulo"""
"""Rio"""
"""Maranhao"""
"""Amazonas"""
