#### Operations on column names

In [0]:
# Small demo frame: two integer columns ('a', 'b') and one string column ('c').
df = pd.DataFrame(dict(
    a=[1, 1, 1, 2, 2, 3, 4, 5],
    b=[10, 10, 11, 20, 0, 0, 40, 50],
    c=['apple', 'apple', 'plum', 'pear', 'plum', 'apple', 'apple', 'apple'],
))
df

Unnamed: 0,a,b,c
0,1,10,apple
1,1,10,apple
2,1,11,plum
3,2,20,pear
4,2,0,plum
5,3,0,apple
6,4,40,apple
7,5,50,apple


In [0]:
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [0]:
# Rename only the columns listed in the mapping; 'c' is left untouched.
# The result is assigned back instead of using inplace=True, so the cell
# states explicitly which name holds the renamed frame.
df = df.rename(columns={'a': 'A',
                        'b': 'B'})

In [0]:
df.columns

Index(['A', 'B', 'c'], dtype='object')

In [0]:
df.columns = ['aa','bb','cc']

In [0]:
df

Unnamed: 0,aa,bb,cc
0,1,10,apple
1,1,10,apple
2,1,11,plum
3,2,20,pear
4,2,0,plum
5,3,0,apple
6,4,40,apple
7,5,50,apple


#### String operations

In [0]:
mystring = 'This is an apple '
mystring

'This is an apple '

In [0]:
mystring.strip(' ')

'This is an apple'

In [0]:
# Careful: strip() treats its argument as a *set of characters*, not as a
# prefix/suffix. It removes any of 'T', 'h', 'i', 's' from both ends until it
# meets a character outside that set; the trailing space stays because ' '
# is not in the set.
mystring.strip('This')

' is an apple '

In [0]:
mystring.replace('apple','pear')

'This is an pear '

In [0]:
mystring.split(' ')

['This', 'is', 'an', 'apple', '']

In [0]:
mystring.replace('apple','pear').strip(' ').split(' ')

['This', 'is', 'an', 'pear']

In [0]:
one_char = mystring[0]
one_char

'T'

In [0]:
one_char.islower(), one_char.isupper()

(False, True)

In [0]:
mystring.upper()

'THIS IS AN APPLE '

In [0]:
' '.join(['This', 'is', 'an', 'apple'])

'This is an apple'

#### Regular expressions

Search

In [0]:
import re

In [0]:
# re.search scans the text and yields a match object for the first hit
# (or None when the pattern does not occur).
pattern, text = 'pattern', 'Is the pattern in here?'
match = re.compile(pattern).search(text)

In [0]:
match

<_sre.SRE_Match object; span=(7, 14), match='pattern'>

In [0]:
match.start(), match.end()

(7, 14)

In [0]:
match.group()

'pattern'

In [0]:
if match:
    print('Pattern is found')

Pattern is found


Multiple occurrences

In [0]:
pattern = 'er'
text = 'There can be multiple patterns as well'

In [0]:
all_matches = re.findall(pattern, text)
all_matches

['er', 'er']

Finding regular expressions

In [0]:
# Quantifiers: how many times the character before them may repeat.
text = 'abbaaabbbbaaaaa'
patterns = [
    'ab*',    # 'a', then any number of 'b' (including none)
    'ab+',    # 'a', then at least one 'b'
    'ab?',    # 'a', then at most one 'b'
    'ab{3}',  # 'a', then exactly three 'b'
]

print(text)
for regex in patterns:
    found = re.findall(regex, text)
    print(regex, found)

abbaaabbbbaaaaa
ab* ['abb', 'a', 'a', 'abbbb', 'a', 'a', 'a', 'a', 'a']
ab+ ['abb', 'abbbb']
ab? ['ab', 'a', 'a', 'ab', 'a', 'a', 'a', 'a', 'a']
ab{3} ['abbb']


In [0]:
# Character classes: [...] matches one character out of those listed;
# the trailing '+' turns that into "a run of such characters".
text = 'ab aag AB somethingAAG'
patterns = [
    '[ab]+',           # run made of lower-case a / b
    '[AB]+',           # run made of upper-case A / B
    '[a-z]+',          # run of lower-case letters
    '[A-Z]+',          # run of upper-case letters
    'something[A-Z]',  # the literal word followed by exactly one capital
    '[a-zA-Z]+',       # run of letters of either case
]

print(text)
for regex in patterns:
    found = re.findall(regex, text)
    print(regex, found)

ab aag AB somethingAAG
[ab]+ ['ab', 'aa']
[AB]+ ['AB', 'AA']
[a-z]+ ['ab', 'aag', 'something']
[A-Z]+ ['AB', 'AAG']
something[A-Z] ['somethingA']
[a-zA-Z]+ ['ab', 'aag', 'AB', 'somethingAAG']


In [0]:
# Negated classes and the dot wildcard.
text = 'You can. also find - without'
patterns = [
    '[^-. ]+',  # run of characters that are none of '-', '.', or space
    'a.',       # 'a' followed by any single character
    'f.*d',     # 'f', then any characters, ending with 'd'
]

print(text)
for regex in patterns:
    found = re.findall(regex, text)
    print(regex, found)

You can. also find - without
[^-. ]+ ['You', 'can', 'also', 'find', 'without']
a. ['an', 'al']
f.*d ['find']


In [0]:
# Shorthand classes: the upper-case form is the negation of the lower-case one.
text = 'Finding different types 1,2,3'
patterns = [
    r'\d+',  # run of digits
    r'\D+',  # run of non-digits
    r'\s+',  # run of whitespace
    r'\S+',  # run of non-whitespace
    r'\w+',  # run of word characters (letters, digits, underscore)
    r'\W+',  # run of non-word characters
]

print(text)
for regex in patterns:
    found = re.findall(regex, text)
    print(regex, found)

Finding different types 1,2,3
\d+ ['1', '2', '3']
\D+ ['Finding different types ', ',', ',']
\s+ [' ', ' ', ' ']
\S+ ['Finding', 'different', 'types', '1,2,3']
\w+ ['Finding', 'different', 'types', '1', '2', '3']
\W+ [' ', ' ', ' ', ',', ',']


#### List comprehension

In [0]:
# Explicit-loop version: capitalize every name and collect the results.
# (The next cells show the same thing as a list comprehension.)
mylist = ['adam', 'bruno', 'cecile']
new_list = []
for name in mylist:
    capitalized = name.capitalize()
    new_list.append(capitalized)
new_list

['Adam', 'Bruno', 'Cecile']

In [0]:
[x.capitalize() for x in mylist]

['Adam', 'Bruno', 'Cecile']

In [0]:
[x.capitalize() for x in mylist if not x.startswith('a')]

['Bruno', 'Cecile']

In [0]:
[re.findall('[aeiou]', x) for x in mylist]

[['a', 'a'], ['u', 'o'], ['e', 'i', 'e']]

## Excluding days with extreme weather conditions

### 1 - exercise

Import the pandas library and load the weather_filled.csv file you downloaded from the class 5 folder into a dataframe called weather. <br>
Check the names of the columns. Is the naming consistent?

In [8]:
# Your code here
import pandas as pd
weather = pd.read_csv('weather_filled.csv')
weather.columns

Index(['Date', 'Max_Temperature_F', 'Mean_Temperature_F', 'Min_TemperatureF',
       'Max_Dew_Point_F', 'MeanDew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity',
       'Mean_Humidity ', 'Min_Humidity ', 'Max_Sea_Level_Pressure_In ',
       'Mean_Sea_Level_Pressure_In ', 'Min_Sea_Level_Pressure_In ',
       'Max_Visibility_Miles ', 'Mean_Visibility_Miles ',
       'Min_Visibility_Miles ', 'Max_Wind_Speed_MPH ', 'Mean_Wind_Speed_MPH ',
       'Max_Gust_Speed_MPH', 'Precipitation_In ', 'Cloud_Cover ', 'Events',
       'Wind_Dir_Degrees', 'landmark'],
      dtype='object')

In [9]:
mylist = weather.columns

In [11]:
nospace_colnames = [x.strip(' ') for x in mylist]

In [22]:
to_correct = 'MeanDewPoint_F'

In [14]:
import re

In [23]:
# Find the first place where a lower-case letter is directly followed by
# one or more upper-case letters, and put an underscore between them.
match = re.search('[a-z][A-Z]+', to_correct)
error_chars = match.group()
# The pattern may capture a whole run of capitals, so keep everything after
# the first character (error_chars[1:]); using only error_chars[1] would
# silently drop the rest of the run.
# str.replace changes every occurrence of this particular substring; other
# lower/upper boundaries in the name are handled by the loop version below.
to_correct.replace(error_chars, error_chars[0] + '_' + error_chars[1:])

'Mean_DewPoint_F'

In [31]:
# Insert the missing underscore wherever a lower-case letter is directly
# followed by upper-case letter(s), then remove surrounding spaces.
corrected_colnames = []
for colname in mylist:
    # Each hit is one lower-case letter plus the whole run of capitals that
    # follows it (e.g. 'nD' in 'MeanDew').
    error_list = re.findall('[a-z][A-Z]+', colname)
    for error_chars in error_list:
        # Keep the entire upper-case run after the underscore; taking only
        # error_chars[1] would delete any further capitals in the match.
        colname = colname.replace(error_chars,
                                  error_chars[0] + '_' + error_chars[1:])
    corrected_colnames.append(colname.strip(' '))

In [32]:
corrected_colnames

['Date',
 'Max_Temperature_F',
 'Mean_Temperature_F',
 'Min_Temperature_F',
 'Max_Dew_Point_F',
 'Mean_Dew_Point_F',
 'Min_Dewpoint_F',
 'Max_Humidity',
 'Mean_Humidity',
 'Min_Humidity',
 'Max_Sea_Level_Pressure_In',
 'Mean_Sea_Level_Pressure_In',
 'Min_Sea_Level_Pressure_In',
 'Max_Visibility_Miles',
 'Mean_Visibility_Miles',
 'Min_Visibility_Miles',
 'Max_Wind_Speed_MPH',
 'Mean_Wind_Speed_MPH',
 'Max_Gust_Speed_MPH',
 'Precipitation_In',
 'Cloud_Cover',
 'Events',
 'Wind_Dir_Degrees',
 'landmark']

### 1 - check yourself

No: some column names have an extra space at the end, <br>
and in others the underscore between two words is missing.

### 2 - exercise

Correct these mistakes! You can choose how to do it, but at the end remove the space from the column names and put underscores wherever needed. <br><br> You can loop through the column names and use string operations to insert the underscore where needed. You can also use the re module to find these occurrences. With the re module you can do the replacement in one step as well. <br>
You can rename the columns inside the loop or you can build a list of the new column names and assign this new list as the new column names. You can use list comprehension for this.

In [None]:
### version 1

In [1]:
import pandas as pd
weather = pd.read_csv('weather_filled.csv')

In [2]:
c = 'MeanDew_Point_F'

In [3]:
for i in range(len(c) - 1):
    print(c)

MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F
MeanDew_Point_F


In [4]:
for i in range(len(c) - 1):
    print(c[i], c[i+1], c[i].islower(), c[i+1].isupper())

M e False False
e a True False
a n True False
n D True True
D e False False
e w True False
w _ True False
_ P False True
P o False False
o i True False
i n True False
n t True False
t _ True False
_ F False True


In [5]:
# Build the corrected name one character at a time.
# The loop stops one character before the end because it looks ahead at
# c[i+1]; the final character is therefore NOT added here and has to be
# appended afterwards.
new_c = ''
for i in range(len(c) - 1):
    if c[i].islower() and c[i+1].isupper():
        # lower-case directly followed by upper-case: underscore is missing
        new_c += c[i] + '_'
        print('XXXXXX')  # debug marker: shows where an underscore was inserted
    else:
        new_c += c[i]
    print(new_c)  # partial result after each step

M
Me
Mea
XXXXXX
Mean_
Mean_D
Mean_De
Mean_Dew
Mean_Dew_
Mean_Dew_P
Mean_Dew_Po
Mean_Dew_Poi
Mean_Dew_Poin
Mean_Dew_Point
Mean_Dew_Point_


In [7]:
# The loop above stops before the last character, so add it here.
# This is deliberately a non-mutating expression: with `new_c += c[-1]`,
# re-running the cell appends the last character again on every run.
new_c + c[-1]

'Mean_Dew_Point_F'

In [3]:

# Version 1: pure string operations.
# Walk through every column name character by character, insert '_' between
# a lower-case letter and a directly following upper-case letter, and strip
# surrounding spaces. All new names are collected first and applied with a
# single rename, so a freshly renamed column can never be matched again by a
# later original name.
rename_map = {}
for c in weather.columns:
    new_c = ''
    for i, ch in enumerate(c):
        new_c += ch
        # c[i+1:i+2] is '' after the last character (and ''.isupper() is
        # False), so no separate handling of the final character is needed
        # and an empty column name does not raise IndexError.
        if ch.islower() and c[i+1:i+2].isupper():
            new_c += '_'
    rename_map[c] = new_c.strip(' ')
weather = weather.rename(columns=rename_map)


In [None]:
### version 2

In [10]:
import re
weather = pd.read_csv('weather_filled.csv')

In [11]:
c = 'MeanDew_Point_F'

In [12]:
re.findall("[a-z][A-Z]", c)

['nD']

In [13]:
mu = 'nD'
c.replace(mu, mu[0] + '_' + mu[1])

'Mean_Dew_Point_F'

In [0]:


# Version 2: let `re` locate every lower-case/upper-case pair, then patch
# each pair with str.replace and collect the cleaned names.
new_names = []
for column in weather.columns:
    fixed = column
    for pair in re.findall("[a-z][A-Z]", column):
        lower, upper = pair  # each hit is exactly two characters long
        fixed = fixed.replace(pair, lower + '_' + upper)
    new_names.append(fixed.strip(' '))
weather.columns = new_names



In [None]:
### version 3

In [14]:
c = 'MeanDew_Point_F'

In [21]:
re.sub("([a-z])([A-Z])", "\\1_\\2", c)

'Mean_Dew_Point_F'

In [22]:
# Version 3: a single regex substitution per name does the whole fix.
# Group 1 is the lower-case letter and group 2 the upper-case letter right
# after it; the replacement writes both back with '_' in between.
# The file is re-read so this version starts from the uncorrected names.
weather = pd.read_csv('weather_filled.csv')
weather.columns = [
    re.sub("([a-z])([A-Z])", "\\1_\\2", name).strip(' ')
    for name in weather.columns
]

### 2 - check yourself

In [23]:
# Self-check: the cleaned header must equal this reference list exactly
# (same names in the same order).
expected_columns = [
    'Date', 'Max_Temperature_F', 'Mean_Temperature_F', 'Min_Temperature_F',
    'Max_Dew_Point_F', 'Mean_Dew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity',
    'Mean_Humidity', 'Min_Humidity', 'Max_Sea_Level_Pressure_In',
    'Mean_Sea_Level_Pressure_In', 'Min_Sea_Level_Pressure_In',
    'Max_Visibility_Miles', 'Mean_Visibility_Miles', 'Min_Visibility_Miles',
    'Max_Wind_Speed_MPH', 'Mean_Wind_Speed_MPH', 'Max_Gust_Speed_MPH',
    'Precipitation_In', 'Cloud_Cover', 'Events', 'Wind_Dir_Degrees',
    'landmark',
]
columns_ok = list(weather.columns) == expected_columns
print('The column names are correct' if columns_ok
      else 'The column names are NOT correct')

The column names are correct
