# Modificando o Dataset

In [1]:
import numpy as np
import pandas as pd

## >REINDEX

In [2]:
# Sample frame: one row per browser, labelled by the browser name.
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame(
    data={
        'http_status': [200, 200, 404, 404, 301],
        'response_time': [0.04, 0.02, 0.07, 0.08, 1.0],
    },
    index=index,
)
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [3]:
# Labels that do not exist in `df` ('Iceweasel', 'Comodo Dragon') come back as all-NaN rows.
new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']
df.reindex(index=new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


## >RESET INDEX

In [4]:
# Four animals labelled by name; the monkey's max_speed is missing on purpose.
df = pd.DataFrame(
    data=[
        ('bird', 389.0),
        ('bird', 24.0),
        ('mammal', 80.5),
        ('mammal', np.nan),
    ],
    index=['falcon', 'parrot', 'lion', 'monkey'],
    columns=('class', 'max_speed'),
)
df

Unnamed: 0,class,max_speed
falcon,bird,389.0
parrot,bird,24.0
lion,mammal,80.5
monkey,mammal,


In [5]:
# Move the row labels into a regular column named 'index' and renumber the rows from 0.
df.reset_index()

Unnamed: 0,index,class,max_speed
0,falcon,bird,389.0
1,parrot,bird,24.0
2,lion,mammal,80.5
3,monkey,mammal,


In [6]:
df.reset_index(drop=True) # Discard the previous index instead of keeping it as a column

Unnamed: 0,class,max_speed
0,bird,389.0
1,bird,24.0
2,mammal,80.5
3,mammal,


## >SORT INDEX

In [7]:
# Small sales table with the default 0..3 integer index.
df = pd.DataFrame(
    data={
        'month': [1, 4, 7, 10],
        'year': [2012, 2014, 2013, 2014],
        'sale': [55, 40, 84, 31],
    }
)
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [8]:
# Order the 'sale' values ascending; each value keeps its original row label.
df['sale'].sort_values()

3    31
1    40
0    55
2    84
Name: sale, dtype: int64

In [9]:
# Order by row label rather than by value (the labels are already 0..3, so the order is unchanged).
df['sale'].sort_index()

0    55
1    40
2    84
3    31
Name: sale, dtype: int64

## >SET INDEX

In [10]:
# Rebuild the sales table so this section starts from a known frame.
df = pd.DataFrame(
    data={
        'month': [1, 4, 7, 10],
        'year': [2012, 2014, 2013, 2014],
        'sale': [55, 40, 84, 31],
    }
)
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [11]:
# Use the 'month' column as the row index; the result is only displayed, `df` itself is not reassigned.
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


---

## >REPLACE

In [25]:
# Two text columns to demonstrate value replacement.
df = pd.DataFrame(
    data={
        'A': ['bat', 'foo', 'bait'],
        'B': ['abc', 'bar', 'xyz'],
    }
)
df

Unnamed: 0,A,B
0,bat,abc
1,foo,bar
2,bait,xyz


In [26]:
# Swap every exact 'foo' in column A for 'Ram'; only whole-cell matches are replaced.
df['A'] = df['A'].replace('foo', 'Ram')
df

Unnamed: 0,A,B
0,bat,abc
1,Ram,bar
2,bait,xyz


## >DROP LEVEL

In [28]:
# Build a frame whose first two positional columns become a two-level index named ('a', 'b').
df = (
    pd.DataFrame(data=[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    .set_index([0, 1])
    .rename_axis(['a', 'b'])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,2,3
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,3,4
5,6,7,8
9,10,11,12


In [29]:
# Remove index level 'a', leaving 'b' as the only index level.
df.droplevel('a')

Unnamed: 0_level_0,2,3
b,Unnamed: 1_level_1,Unnamed: 2_level_1
2,3,4
6,7,8
10,11,12


---

## >SPLIT

In [30]:
# Plain-Python warm-up: str.split returns a list of the space-separated pieces.
s = "x y z"
tokens = s.split(' ')

print(tokens)     # every piece
print(tokens[0])  # only the first piece

['x', 'y', 'z']
x


In [32]:
# Load the movie catalogue; the path is relative to the current working directory.
dados = pd.read_csv('Dados/movie.csv')
# Preview the first rows only instead of rendering the whole frame.
dados.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# Break each pipe-separated genre string into a list of genre names (result is displayed, not stored).
dados['genres'].str.split('|')

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9120                          [Adventure, Drama, Romance]
9121                 [Action, Adventure, Fantasy, Sci-Fi]
9122                                        [Documentary]
9123                                             [Comedy]
9124                                        [Documentary]
Name: genres, Length: 9125, dtype: object

In [37]:
# The release year is the last space-separated token of the title, e.g. "(1995)".
# Titles that do not end in a year contribute their last word instead
# (row 9124 yields "Unboxed"), so the column is not guaranteed to hold only years.
# The token is taken directly with the .str accessor, so no scratch column is
# left behind on `dados`.
dados['year'] = dados['title'].str.split(' ').str[-1]
dados['year']

0        (1995)
1        (1995)
2        (1995)
3        (1995)
4        (1995)
         ...   
9120     (2016)
9121     (2016)
9122     (2016)
9123     (1936)
9124    Unboxed
Name: year, Length: 9125, dtype: object

In [39]:
# Peel the closing, then the opening parenthesis off each token: "(1995)" -> "1995".
# Tokens without parentheses (e.g. "Unboxed") pass through unchanged.
dados['year'] = dados['year'].str.strip(')').str.strip('(')
dados['year']

0          1995
1          1995
2          1995
3          1995
4          1995
         ...   
9120       2016
9121       2016
9122       2016
9123       1936
9124    Unboxed
Name: year, Length: 9125, dtype: object

---

## >STACK

In [40]:
# Two animals (rows) by two measurements (columns).
df = pd.DataFrame(
    data=[[0, 1], [2, 3]],
    index=['cat', 'dog'],
    columns=['weight', 'height'],
)
df

Unnamed: 0,weight,height
cat,0,1
dog,2,3


In [41]:
# Pivot the column labels into an inner index level, giving a Series indexed by (animal, measurement).
df.stack()

cat  weight    0
     height    1
dog  weight    2
     height    3
dtype: int64

## >UNSTACK

In [42]:
# On this single-level-index frame, unstack() also yields a Series, but with the column label as the outer level.
df.unstack()

weight  cat    0
        dog    2
height  cat    1
        dog    3
dtype: int64

In [43]:
# Round trip: unstack() moves the innermost index level back to the columns, recovering the original layout.
# NOTE: this rebinds `s`, which held a plain string in the SPLIT section.
s = df.stack()
s.unstack()

Unnamed: 0,weight,height
cat,0,1
dog,2,3


In [44]:
# level=0 moves the outer level (cat/dog) to the columns instead of the inner one.
s.unstack(level=0)

Unnamed: 0,cat,dog
weight,0,2
height,1,3


---

## >MELT

In [17]:
# Wide-format frame: 'A' is the identifier, 'B' and 'C' are the measured columns.
df = pd.DataFrame(
    data={
        'A': {0: 'a', 1: 'b', 2: 'c'},
        'B': {0: 1, 1: 3, 2: 5},
        'C': {0: 2, 1: 4, 2: 6},
    }
)
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [18]:
# Keep 'A' as the identifier and unpivot only column 'B' into (variable, value) rows.
df.melt(id_vars=['A'], value_vars=['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [19]:
# Unpivot both 'B' and 'C': one output row per (A, column) pair, so 3 rows become 6.
df.melt(id_vars=['A'], value_vars=['B', 'C'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [14]:
# Second melt example: student records with one text and one numeric attribute.
df = pd.DataFrame(
    data={
        'Name': {0: 'Ritika', 1: 'shyam', 2: 'neil'},
        'Course': {0: 'Masters', 1: 'Graduate', 2: 'Masters'},
        'Age': {0: 22, 1: 20, 2: 24},
    }
)
df

Unnamed: 0,Name,Course,Age
0,Ritika,Masters,22
1,shyam,Graduate,20
2,neil,Masters,24


In [15]:
# Unpivot 'Course' and 'Age' under each 'Name'; the single 'value' column then holds both text and numbers.
df.melt(id_vars=['Name'],value_vars=['Course','Age'])

Unnamed: 0,Name,variable,value
0,Ritika,Course,Masters
1,shyam,Course,Graduate
2,neil,Course,Masters
3,Ritika,Age,22
4,shyam,Age,20
5,neil,Age,24


## >EXPLODE

In [20]:
# Column 'A' mixes lists, a scalar string and an empty list; the scalar 1 fills every row of 'B'.
df = pd.DataFrame(
    data={
        'A': [[1, 2, 3], 'foo', [], [3, 4]],
        'B': 1,
    }
)
df

Unnamed: 0,A,B
0,"[1, 2, 3]",1
1,foo,1
2,[],1
3,"[3, 4]",1


In [21]:
# One row per list element, repeating the row label; the scalar 'foo' is kept as-is
# and the empty list becomes a single row with a missing value.
df.explode('A')

Unnamed: 0,A,B
0,1,1
0,2,1
0,3,1
1,foo,1
2,,1
3,3,1
3,4,1


## >SQUEEZE

In [22]:
# Minimal 2x2 frame used to show how squeeze() collapses a one-column selection.
df = pd.DataFrame(
    data=[[1, 2], [3, 4]],
    columns=['a', 'b'],
)
df

Unnamed: 0,a,b
0,1,2
1,3,4


In [23]:
# Double brackets select a one-column DataFrame (the output below still has a column header).
df_a = df[['a']]
df_a

Unnamed: 0,a
0,1
1,3


In [24]:
# Collapse the one-column DataFrame into a Series named after that column.
df_a.squeeze()

0    1
1    3
Name: a, dtype: int64