# Polars Tuto 2

## Lire le dataframe parquet 

In [20]:
import polars as pl 
import pandas as pd
import sys 
df = pl.read_parquet('data/tweets.parquet.gzip')
df_pd = pd.read_csv('data/tweets.csv')

#### La taille en mémoire du dataframe

In [6]:
sys.getsizeof(df)/1024**2 # en PL

4.57763671875e-05

In [22]:
sys.getsizeof(df_pd)/1024**2 # en PD

73.74482822418213

#### Création d'une nouvelle colonne 

In [9]:
df.columns

['Unnamed: 0', 'date', 'true_sentiment', 'content_cleaned']

#### Sélection de colonnes 

In [13]:
df_filtered_pd = df_pd[['true_sentiment', 'content_cleaned']]#PD

df_filtered_pl = df.select(pl.col('true_sentiment'), pl.col('content_cleaned'))#PL

In [23]:
df_filtered_pd

true_sentiment,content_cleaned
f64,str
1.0,"""cashtag I wan…"
1.0,"""cashtag Don't…"
0.0,"""cashtag Anoth…"
1.0,"""cashtag iPhon…"
0.0,"""cashtag casht…"
…,…
1.0,"""cashtag damm …"
1.0,"""cashtag IDC J…"
1.0,"""cashtag bears…"
0.0,"""cashtag just …"


####  Filtrer des colonnes 

In [29]:
df_filtered_pd = df_pd[df_pd['true_sentiment'] == 1.0]
df_filtered_pd.head(2)

Unnamed: 0.1,Unnamed: 0,date,true_sentiment,content_cleaned
0,0,2017-05-09T20:32:06Z,1.0,cashtag I wanna sell some shares to buy me an...
1,1,2023-02-08T21:31:58Z,1.0,"cashtag Don't forget cashtag tomorrow, heade..."


In [44]:
df_filtered_pd = df.filter(pl.col('true_sentiment')==1)
df_filtered_pd.head(2)

Unnamed: 0,date,true_sentiment,content_cleaned
i64,str,i8,str
0,"""2017-05-09T20:…",1,"""cashtag I wan…"
1,"""2023-02-08T21:…",1,"""cashtag Don't…"


#### double condition 

In [55]:
df_filtered = df.filter(
    (pl.col('true_sentiment') == 1) & 
    (pl.col('content_cleaned').str.contains('Bad'))
)
df_filtered.shape

(61, 4)

#### Changement de type de variable 

In [40]:
df_pd['true_sentiment'] = df_pd['true_sentiment'].astype('int')
df_pd.head(2)

Unnamed: 0.1,Unnamed: 0,date,true_sentiment,content_cleaned
0,0,2017-05-09T20:32:06Z,1,cashtag I wanna sell some shares to buy me an...
1,1,2023-02-08T21:31:58Z,1,"cashtag Don't forget cashtag tomorrow, heade..."


In [41]:
df = df.with_columns(pl.col('true_sentiment').cast(pl.Int8))
df.head(2)

Unnamed: 0,date,true_sentiment,content_cleaned
i64,str,i8,str
0,"""2017-05-09T20:…",1,"""cashtag I wan…"
1,"""2023-02-08T21:…",1,"""cashtag Don't…"


Unnamed: 0,date,true_sentiment,content_cleaned
i64,str,i8,str
0,"""2017-05-09T20:…",1,"""cashtag I wan…"
1,"""2023-02-08T21:…",1,"""cashtag Don't…"
2,"""2022-05-13T08:…",0,"""cashtag Anoth…"
3,"""2019-09-18T06:…",1,"""cashtag iPhon…"
4,"""2023-06-05T13:…",0,"""cashtag casht…"
…,…,…,…
249995,"""2019-08-13T19:…",1,"""cashtag damm …"
249996,"""2013-03-12T18:…",1,"""cashtag IDC J…"
249997,"""2021-09-02T14:…",1,"""cashtag bears…"
249998,"""2023-11-02T19:…",0,"""cashtag just …"


#### Comment faire un value_counts()? 

In [58]:
df_pd['true_sentiment'].value_counts()

true_sentiment
1    125000
0    125000
Name: count, dtype: int64

In [59]:
df['true_sentiment'].value_counts()

true_sentiment,count
i8,u32
0,125000
1,125000


#### Supprimer une colonne 

In [66]:
df_pd_dropped = df_pd.drop('Unnamed: 0', axis=1)
df_pd_dropped.head(2)

Unnamed: 0,date,true_sentiment,content_cleaned
0,2017-05-09T20:32:06Z,1,cashtag I wanna sell some shares to buy me an...
1,2023-02-08T21:31:58Z,1,"cashtag Don't forget cashtag tomorrow, heade..."


In [68]:
df_dropped = df.drop('Unnamed: 0')
df_dropped.head(2)

date,true_sentiment,content_cleaned
str,i8,str
"""2017-05-09T20:…",1,"""cashtag I wan…"
"""2023-02-08T21:…",1,"""cashtag Don't…"
