# Text extraction

In this section, we show how temperatures and wind data from 2002 until now are extracted from avalanche reports that we previously downloaded.

## 1: Temperature extraction:

In [202]:
import numpy as np
import pandas as pd
import re
import glob
import os
import dateutil

We define some functions that will be used to extract temperatures from text.

In [203]:
# regex pattern used to select the temperature data
temperature_pattern = re.compile(r"(moins |plus )?(\d+) degre", re.IGNORECASE)
paragraph_pattern = re.compile(r"(Rtrospective mto)?(.)?(Perspective mto)", re.IGNORECASE)
CONTEXT = 25
main_directions = ['nord', 'sud', 'est', 'ouest']
directions = main_directions + ['nord-est', 'nord-ouest', 'sud-est', 'sud-ouest']


def replace_words(string, tokenize_map):
    """Replace similar words by a token
    tokenize_map should be a dict(word -> token)
    """
    for w, t in tokenize_map.items():
        string = string.replace(w, t)
    return string

def get_paragraph(content,year):
    """Return the paragraph that contains the temperatures
    returns the paragraph string
    """
    paragraph = None
    if year<2013:
        for text in content.split('\n\n\n'):
            text = text.lower()
            if 'situation generale' in text:
                paragraph = text
    else:
        for match in re.compile(r"Rtrospective mto(.*)Perspective mto", re.IGNORECASE).finditer(content.replace('\n',' ')):
            paragraph = match[0]
            
    return paragraph
            

def extract_temperatures(paragraph):
    """Obtain the location for each temperature
    returns a dict(region -> temperature)
    """
    result = {}
    
    ts = []
    for match in temperature_pattern.finditer(paragraph):
        sign = -1 if match[1] == 'moins ' else 1
        value = int(match[2])
        end = match.end()
        ts.append((sign * value, end))
    
    if len(ts) == 1:
        result['default'] = ts[0][0]
    elif len(ts) > 1:
        for value, end in ts:
            for direction in main_directions:
                if direction in paragraph[end:end + CONTEXT]:
                    result[direction] = value
                    break
    
    return result
    
tokens = {
    'degre': ['degré', 'degrés', 'degre', 'degres', 'degree', 'degrees', 'degrée', 'degrées', '°', '°C', '° C'],
    'plus ': ['+', 'jusqu\'au-dela de ', 'au-dela '],
    'moins ': ['-'],
    'situation generale': ['situation générale', 'COUVERTURE NEIGEUSE'],
    'Rtrospective mto':['Rétrospective mto', 'Retrospective meteo','Rtrospective mtorologique', 'Retrospective meteorologique'],
    'Perspective mto':['Perspective meteo','Tendance','tendance'],
    'plus 0': ['zero', 'zéro'],
    'ouest': ['l''ouest'],
    'est': ['l''est'],
    '1': ['un'],
}

tokens_map = {word: token for token, words in tokens.items() for word in words}

temperatures = {
    'default': [],
    'nord': [],
    'sud': [],
    'est': [],
    'ouest': [],
}

We previously transformed the pdf files into text files in order to do the text processing. Now we extract the temperatures for each file of each year between 2002 and now, using our algorithm described in milestone2.md. 

In [204]:
no_situation_paragraph = 0 
total_files = 0

for year in range(2002,2018):
    path = "../data/slf/{}/nb/fr/{}".format(str(year), 'txt' if year < 2013 else 'txt_extracted')
    
    for filename in glob.glob(os.path.join(path, '*.txt')):
        file_date = dateutil.parser.parse(filename[27:35] if year < 2013 else filename[37:45])
        total_files += 1

        with open(filename, 'rb') as file:
            content = file.read().decode("utf-8", "ignore")
            content = replace_words(content, tokens_map)
            
            # select paragraph in which temperatures are present
            paragraph = get_paragraph(content,year)            
            
            if not paragraph:
                no_situation_paragraph += 1
                
            else:
                paragraph = paragraph.replace("\n", " ")
                ts = extract_temperatures(paragraph)
                for direction, t in ts.items():
                    temperatures[direction].append((file_date, t))

print('Total number of report without situation paragraph: {}/{}'.format(no_situation_paragraph, total_files))

Total number of report without situation paragraph: 54/3544


We organize our extracted data inside a dataframe for further analysis

In [205]:
records = [(date, region, t) for region, ts in temperatures.items() for date, t in ts]

In [206]:
results = pd.DataFrame(records, columns=['date', 'region', 'temperature'])
results.region = results.region.str.replace('default', '-')
results = results.sort_values(by='date')
print('We collected temperatures for %d dates' %len(results.date.unique()))
results = results.set_index(['date', 'region'])
results.head(15)

We collected temperatures for 1523 dates


Unnamed: 0_level_0,Unnamed: 1_level_0,temperature
date,region,Unnamed: 2_level_1
2001-11-12,-,15
2001-11-25,-,8
2001-11-26,-,0
2001-11-29,-,-5
2001-12-05,-,0
2001-12-08,-,-2
2001-12-10,-,2
2001-12-11,-,-3
2001-12-12,-,-5
2001-12-13,-,-15


We checked if our algorithm has worked by selecting a sampling of 20 dates.

## 2: Wind extraction:

Now we will extract wind data from avalanche reports.
There are no exact numbers like wind speed in the reports, but an evaluation of the strength of the wind is given.
Thus, our output will be categorical: strong, moderate or weak.

First some variables are initialized and some functions are defined.

In [215]:
# regex pattern used to select the wind information 
wind_pattern = re.compile(r"(fort |faible |modere )", re.IGNORECASE)
CONTEXT = 25
main_directions = ['nord', 'sud', 'est', 'ouest']
directions = main_directions + ['nord-est', 'nord-ouest', 'sud-est', 'sud-ouest']

def extract_wind(paragraph):
    """Obtain the location for each temperature
    returns a dict(region -> temperature)
    """
    result = {}
    
    ts = []
    for match in wind_pattern.finditer(paragraph):
        value = match[0]
        end = match.end()
        ts.append((value, end))
    
    if len(ts) == 1:
        result['default'] = ts[0][0]
    elif len(ts) > 1:
        for value, end in ts:
            for direction in main_directions:
                if direction in paragraph[end:end + CONTEXT]:
                    result[direction] = value
                    break
    
    return result

tokens = {
    'modere': ['modéré','modere','modérés','moderes'],
    'fort': ['forts'],
    'faible': ['faibles'],
    'situation generale': ['situation générale', 'COUVERTURE NEIGEUSE'],
    'Rtrospective mto':['Rétrospective mto', 'Retrospective meteo','Rtrospective mtorologique', 'Retrospective meteorologique'],
    'Perspective mto':['Perspective meteo','Tendance','tendance'],
    'ouest': ['l''ouest'],
    'est': ['l''est'],
}

tokens_map = {word: token for token, words in tokens.items() for word in words}

wind = {
    'default': [],
    'nord': [],
    'sud': [],
    'est': [],
    'ouest': [],
}

Now for each file of each year between 2002 and now, we use our algorithm (as explained in milestone2.md) to extract wind information. 

In [211]:
filename

'../data/slf/2013/nb/fr/txt\\20121008_nb_fr_bw.txt'

In [216]:
no_situation_paragraph = 0
total_files = 0

for year in range(2002,2018):
    path = "../data/slf/{}/nb/fr/txt".format(str(year), 'txt' if year < 2014 else 'txt_extracted')
    
    for filename in glob.glob(os.path.join(path, '*.txt')):
        file_date = dateutil.parser.parse((filename[27:35] if year < 2014 else filename[37:45]))
        total_files += 1

        with open(filename, 'rb') as file:
            content = file.read().decode("utf-8", "ignore")
            content = replace_words(content, tokens_map)
            
            # select paragraph in which temperatures are present
            paragraph = get_paragraph(content,year)  
            
            if not paragraph:
                no_situation_paragraph += 1
            
            else:
                paragraph = paragraph.replace("\n", " ")
                ts = extract_wind(paragraph)
                for direction, t in ts.items():
                    wind[direction].append((file_date, t))

print('Total number of report without situation paragraph: {}/{}'.format(no_situation_paragraph, total_files))

Total number of report without situation paragraph: 55/2063


We define the dataframe in which wind and corresponding dates are inserted.

In [217]:
records = [(date, region, t) for region, ts in wind.items() for date, t in ts]

In [218]:
results = pd.DataFrame(records, columns=['date', 'region', 'wind'])
results.region = results.region.str.replace('default', '-')
results = results.sort_values(by='date')
print('We collected wind data for %d dates' %len(results.date.unique()))
results = results.set_index(['date', 'region'])
results.head(15)

We collected wind data for 1479 dates


Unnamed: 0_level_0,Unnamed: 1_level_0,wind
date,region,Unnamed: 2_level_1
2001-11-12,sud,modere
2001-11-12,nord,modere
2001-11-23,-,faible
2001-11-24,-,faible
2001-11-25,nord,modere
2001-12-01,nord,modere
2001-12-02,-,faible
2001-12-04,est,modere
2001-12-05,-,faible
2001-12-07,-,fort


We checked if our algorithm has worked by selecting a sampling of 20 dates.