# Text extraction

In this notebook, we show how temperatures and wind data from 2002 until now are extracted from avalanche reports that we previously downloaded.

## 1: Temperature extraction:

In [350]:
import numpy as np
import pandas as pd
import re
import glob
import os
import dateutil

We define some functions that will be used to extract temperatures from text.

In [468]:
# Allows to replace some similar words by the seme words
def replace_words(string, words, token):
    for w in words:
        string = string.replace(w, token)
    
    return string

# Obtain the location for each temperature
def get_location(pattern, temp):
    ind = [m.end() for m in pattern.finditer(paragraph.replace("\n"," "))]
    for i in range(len(temp)):
        if len(ind)-1>=i:
            if paragraph.find('nord',ind[i],ind[i]+25) != -1:
                temp[i] = temp[i] + ' nord'
            elif paragraph.find('sud',ind[i],ind[i]+25) != -1:
                temp[i] = temp[i] + ' sud'
            elif paragraph.find('ouest',ind[i],ind[i]+25) != -1:
                temp[i] = temp[i] + ' ouest'
            elif paragraph.find('est',ind[i],ind[i]+25) != -1:
                temp[i] = temp[i] + ' est'
    
    return temp



# initialize_variables():

# We will replace all words that are similar to degre with the same word:degre
words_1 = ['degré','degrés','degre','degres','degree','degrees','degrée','degrées','°','°C','° C']
token_1 = 'degre'
# Same for the term plus
words_2 = ['+','jusqu''au-dela de ','au-dela ']
token_2 = 'plus '
# Same for the term moins
words_3 = ['-']
token_3 = 'moins '
# This change is made to allow paragraph selection
words_4 = ['Rétrospective météo','Retrospective meteo', 'situation générale', 'COUVERTURE NEIGEUSE', 'Retrospective météorologique', 'Retrospective meteorologique']
token_4 = 'situation generale'
# Turns 0 into a number
words_5 = ['zero','zéro']
token_5 = 'plus 0'
# Same principle for west location
words_6 = ['l''ouest']
token_6 = 'ouest'
# TSame principle for east location
words_7 = ['l''est']
token_7 = 'est'
# Turns one into a number
words_8 = ['un']
token_8 = '1'

# This list regroups the different localizations of temperatures
location = ['nord','sud','est','ouest','nord-est','nord-ouest','sud-est','sud-ouest']
    
# Creation of lists that will contain temperatures and dates
temperature = []
temperature_nord = []
temperature_sud = []
temperature_est = []
temperature_ouest = []
date = []
date_nord = []
date_sud = []
date_est = []
date_ouest = []

We previously transformed the pdf files into text files in order to do the text processing. Now we extract the temperatures from the files.

In [469]:
for year in range(2002,2018):
    # we select all the text files corresponding to 1 year
    path = "../data/slf/{}/nb/fr/txt".format(str(year))
    # the algorithm is run for each file of the same year
    for filename in glob.glob(os.path.join(path, '*.txt')):
        
        paragraph = []
        # Opening of a text file
        with open(filename) as file:
            
            content = file.read()
            
            # regroup the same words into one word using the function replace_words
            content = replace_words(content, words_1, token_1)
            content = replace_words(content, words_2, token_2)
            content = replace_words(content, words_3, token_3)
            content = replace_words(content, words_4, token_4)
            content = replace_words(content, words_5, token_5)
            content = replace_words(content, words_6, token_6)
            content = replace_words(content, words_7, token_7)
            content = replace_words(content, words_8, token_8)

            # collect paragraph in which temperature is present
            for text in content.split('\n\n\n'):
                if 'situation generale' in text.lower():
                    paragraph = text
                    
            neg_temp = []
            pos_temp = []
            # check if paragraph is not empty
            if (paragraph != []):
                
                N_temp = 0
                
                # We define different patterns using regex
                pattern_moins = re.compile(r"moins \d+ degre", re.IGNORECASE) # for negative temperatures
                pattern_plus = re.compile(r"plus \d+ degre", re.IGNORECASE) # for positive temperatures
                pattern = re.compile(r"\d+ degre", re.IGNORECASE) # in case no sign is present with the temperature (it's rare)
                
                # we use the patterns obtained with regex to get the right part of the paragraph containing sign and temperature
                neg_temp = [m[0] for m in pattern_moins.finditer(paragraph.replace("\n"," "))]
                pos_temp = [m[0] for m in pattern_plus.finditer(paragraph.replace("\n"," "))]
                if pos_temp == []:
                    pos_temp = ['plus ' + m[0] for m in pattern.finditer(paragraph.replace("\n",""))]
                # For example: pos_temp = ['plus 6 degre'] and neg_temp = ['moins 8 degre']
                
                # define the number of temperatures present in the paragraph:
                N_temp = sum([len(neg_temp),len(pos_temp)])
                
                # in case only one temperature is present in the file
                if N_temp == 1:
                    
                    # Negative temperatures:
                    if (neg_temp != []):
                        # Temperature is added in a list along with the date
                        temperature.append(-int(neg_temp[0].split()[1]))
                        date.append(dateutil.parser.parse(filename[27:35]))
                     
                    # Positive temperatures:
                    elif (pos_temp != []):
                        # Temperature is added in a list along with the date
                        temperature.append(int(pos_temp[0].split()[1]))
                        date.append(dateutil.parser.parse(filename[27:35]))
                                        
                # in case several temperatures are present in the file
                elif N_temp > 1:
                    
                    # we associate for each temperature its location (north, east...)
                    pos_temp = get_location(pattern_plus, pos_temp)
                    neg_temp = get_location(pattern_moins, neg_temp)
                    
                    for i in range(len(neg_temp)): # we deal with negative temperatures
                        
                        if len(neg_temp[i].split()) == 4: # if location was found
                            if neg_temp[i].split()[3] == 'nord':
                                # We update temperature vectors and dates vectors
                                temperature_nord.append(-int(neg_temp[i].split()[1]))
                                date_nord.append(dateutil.parser.parse(filename[27:35]))
                            if neg_temp[i].split()[3]  == 'sud':
                                # We update temperature vectors and dates vectors
                                temperature_sud.append(-int(neg_temp[i].split()[1]))
                                date_sud.append(dateutil.parser.parse(filename[27:35]))
                            if neg_temp[i].split()[3] == 'est':
                                # We update temperature vectors and dates vectors
                                temperature_est.append(-int(neg_temp[i].split()[1])) 
                                date_est.append(dateutil.parser.parse(filename[27:35]))
                            if neg_temp[i].split()[3] == 'ouest':
                                # We update temperature vectors and dates vectors
                                temperature_ouest.append(-int(neg_temp[i].split()[1])) 
                                date_ouest.append(dateutil.parser.parse(filename[27:35]))

                    
                    for i in range(len(pos_temp)): # we deal with positive temperatures
                        
                        if len(pos_temp[i].split()) == 4: # if location was found
                            if pos_temp[i].split()[3] == 'nord': 
                                # We update temperature vectors and dates vectors
                                temperature_nord.append(int(pos_temp[i].split()[1]))
                                date_nord.append(dateutil.parser.parse(filename[27:35]))
                            if pos_temp[i].split()[3]  == 'sud':
                                # We update temperature vectors and dates vectors
                                temperature_sud.append(int(pos_temp[i].split()[1]))
                                date_sud.append(dateutil.parser.parse(filename[27:35]))
                            if pos_temp[i].split()[3] == 'est':
                                # We update temperature vectors and dates vectors
                                temperature_est.append(int(pos_temp[i].split()[1])) 
                                date_est.append(dateutil.parser.parse(filename[27:35]))
                            if pos_temp[i].split()[3] == 'ouest':
                                # We update temperature vectors and dates vectors
                                temperature_ouest.append(int(pos_temp[i].split()[1])) 
                                date_ouest.append(dateutil.parser.parse(filename[27:35]))


We define the dataframes in which temperatures and corresponding dates are inserted.

In [470]:
temp_df = pd.DataFrame({'Date':date,'Temperature':temperature})
temp_df = temp_df.drop_duplicates(subset='Date', keep='first')
temp_df = temp_df.set_index('Date')

# North location temperature dataframe:
temp_df_nord = pd.DataFrame({'Date':date_nord,'Temperature Nord':temperature_nord})
temp_df_nord = temp_df_nord.drop_duplicates(subset='Date', keep='first')
temp_df_nord = temp_df_nord.set_index('Date')

# South location temperature dataframe:
temp_df_sud = pd.DataFrame({'Date':date_sud,'Temperature Sud':temperature_sud})
temp_df_sud = temp_df_sud.drop_duplicates(subset='Date', keep='first')
temp_df_sud = temp_df_sud.set_index('Date')

# East location temperature dataframe:
temp_df_est = pd.DataFrame({'Date':date_est,'Temperature est':temperature_est})
temp_df_est = temp_df_est.drop_duplicates(subset='Date', keep='first')
temp_df_est = temp_df_est.set_index('Date')

# West location temperature dataframe:
temp_df_ouest = pd.DataFrame({'Date':date_ouest,'Temperature ouest':temperature_ouest})
temp_df_ouest = temp_df_ouest.drop_duplicates(subset='Date', keep='first')
temp_df_ouest = temp_df_ouest.set_index('Date')

We concatenate all dataframes to get only one dataframe containing all temperatures with dates as the index

In [471]:
new_df = pd.concat([temp_df, temp_df_nord, temp_df_sud, temp_df_est, temp_df_ouest], axis=1)
print('We collected temperatures for %d dates' %len(new_df))
new_df.head()

We collected temperatures for 2091 dates


Unnamed: 0_level_0,Temperature,Temperature Nord,Temperature Sud,Temperature est,Temperature ouest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-11-12,15.0,,,,
2001-11-25,8.0,,,,
2001-11-26,0.0,,,,
2001-12-02,4.0,,,,
2001-12-05,0.0,,,,


We check if our algorithm has worked by selecting a sampling of 20 dates.

In [357]:
new_df.sample(20)

Unnamed: 0_level_0,Temperature,Temperature Nord,Temperature Sud,Temperature est,Temperature ouest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-04-25,,,5.0,-3.0,
2007-04-14,8.0,,,,
2009-10-20,,,3.0,,
2008-03-31,,-3.0,0.0,,
2017-03-28,,2.0,4.0,,
2007-03-22,,,-6.0,-11.0,-8.0
2009-04-01,,6.0,1.0,,
2008-03-17,,-3.0,2.0,,
2011-12-09,0.0,,,,
2016-12-27,,,3.0,-6.0,1.0


From the sample we took, we obtain an accuracy of 95% for the extraction of temperatures.
Thus it is reasonable to use these temperatures for further analysis.

## 1: Wind extraction:

Now we will extract wind data from avalanche reports.
There are no exact numbers like wind speed in the reports, but an evulation of the strength of the wind is given.
Thus, our output will be categorical: strong, moderate or weak.

First some variables are initialized.

In [472]:
# We will replace all words that are similar to moderate with the same word
words_1 = ['modéré','modere','modérés','moderes']
token_1 = 'modere' 
# Same for the term 'fort' (strong)
words_2 = ['fort','forts']
token_2 = 'fort'
# Same for the term 'faible' (weak)
words_3 = ['faible','faibles']
token_3 = 'faible'
# This change is made to allow paragraph selection
words_4 = ['Rétrospective météo','Retrospective meteo', 'situation générale', 'COUVERTURE NEIGEUSE', 'Retrospective météorologique', 'Retrospective meteorologique']
token_4 = 'situation generale'
    
# Creation of lists that will contain wind data and dates
wind = []
date = []

We now extract the wind data

In [473]:
for year in range(2002,2018):
    # we select all the text files corresponding to 1 year
    path = "../data/slf/{}/nb/fr/txt".format(str(year))
    # the algorithm is run for each file of the same year
    for filename in glob.glob(os.path.join(path, '*.txt')):
        
        paragraph = []
        # Opening of a text file
        #filename = '../data/slf/2002/nb/fr/txt/20020102_nb_fr_bw.txt'
        with open(filename) as file:
            
            content = file.read()
            
            # regroup the same words into one word using the function replace_words
            content = replace_words(content, words_1, token_1)
            content = replace_words(content, words_2, token_2)
            content = replace_words(content, words_3, token_3)
            content = replace_words(content, words_4, token_4)

            # collect paragraph in which wind information is present
            for text in content.split('\n\n\n'):
                if 'situation generale' in text.lower():
                    paragraph = text
            
            #initialize variables
            wind_fort = []
            wind_faible = []
            wind_modere = []
            # check if paragraph is not empty
            if (paragraph != []):
                
                # We select a string containing the wind values
                # Using regex, one pattern is created for each wind strength
                pattern_fort = re.compile(r"fort", re.IGNORECASE)
                pattern_faible = re.compile(r"faible", re.IGNORECASE)
                pattern_modere = re.compile(r"modere", re.IGNORECASE)
                
                # we use the patterns obtained with regex to get the wind strength value
                wind_fort = [m[0] for m in pattern_fort.finditer(paragraph.replace("\n",""))]
                wind_faible = [m[0] for m in pattern_faible.finditer(paragraph.replace("\n",""))]
                wind_modere = [m[0] for m in pattern_modere.finditer(paragraph.replace("\n",""))]
                
                if wind_fort != []:
                    wind.append(wind_fort[0])
                    date.append(dateutil.parser.parse(filename[27:35]))
                
                elif wind_faible !=[]:
                    wind.append(wind_faible[0])
                    date.append(dateutil.parser.parse(filename[27:35]))
                
                elif wind_modere != []:
                    wind.append(wind_modere[0])
                    date.append(dateutil.parser.parse(filename[27:35]))
                

We define the dataframe in which wind and corresponding dates are inserted.

In [362]:
wind_df = pd.DataFrame({'Date':date,'Wind':wind})
wind_df = wind_df.drop_duplicates(subset='Date', keep='first')
wind_df = wind_df.set_index('Date')
print('We collected wind information for %d dates' %len(wind_df))
wind_df.head()

We collected wind information for 2677 dates


Unnamed: 0_level_0,Wind
Date,Unnamed: 1_level_1
2001-11-12,faible
2001-11-23,faible
2001-11-24,faible
2001-11-25,faible
2001-11-27,faible


We check if our algorithm has worked by selecting a sampling of 20 dates.

In [370]:
wind_df.sample(20)  

Unnamed: 0_level_0,Wind
Date,Unnamed: 1_level_1
2005-05-25,fort
2002-02-09,faible
2015-02-26,fort
2004-02-02,fort
2006-05-01,faible
2009-11-29,fort
2014-02-25,fort
2014-12-12,fort
2009-05-05,modere
2008-03-26,faible
