In [0]:
# Delete any existing local `challenge-m2-sid` directory (recursively, ignoring
# errors if it does not exist) so the notebook starts without a stale copy.
!rm -rf challenge-m2-sid

In [6]:
# OS setup
!cat /etc/os-release
!apt-get install -qq bc tree sox

# Liaison avec les données
!git clone "https://etudiantsid:etudiantsidPW;@gitlab.com/jeromefarinas/challenge-m2-sid.git"

NAME="Ubuntu"
VERSION="18.04.3 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.3 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic
Selecting previously unselected package libopencore-amrnb0:amd64.
(Reading database ... 132681 files and directories currently installed.)
Preparing to unpack .../0-libopencore-amrnb0_0.1.3-2.1_amd64.deb ...
Unpacking libopencore-amrnb0:amd64 (0.1.3-2.1) ...
Selecting previously unselected package libopencore-amrwb0:amd64.
Preparing to unpack .../1-libopencore-amrwb0_0.1.3-2.1_amd64.deb ...
Unpacking libopencore-amrwb0:amd64 (0.1.3-2.1) ...
Selecting previously unselected package libmagic-mgc.
Preparing to unpack .../2-libmagic-mgc_1%3a5.32-2ubuntu0.2_amd64.deb ...
Unpacking libmagic-mgc (1:5.32-2ubuntu0.2)

In [0]:
import matplotlib.pyplot as plt
import scipy.io.wavfile
import scipy.signal
import numpy as np
from IPython.display import Audio
import librosa
import librosa.display
import pandas as pd
import os
from nltk.corpus import stopwords
import string
import re
from sklearn import preprocessing
from google.colab import drive
import xml.etree.ElementTree as ET
import spacy
from google.colab import files
from sklearn import preprocessing




In [0]:
# Directory holding the text files of the corpus, and the names of the entries
# it contains, in sorted order.
path_text = 'challenge-m2-sid/corpus/text/'
List_txt = sorted(os.listdir(path_text))

# Section 

In [0]:
# Get only the text between headers
def get_sentences(List_txt, path_text):
  '''
  Extract the sentences of every XML document.

  Each file is parsed as XML. Every child of the root element is treated as
  one sentence, built by concatenating the text of that child's own children
  (children without text are skipped).

  :param List_txt : list : file names on which it will be computed
  :param path_text : string : path to the directory where to find files
  (with or without a trailing separator)
  :return: dictionary : maps each file name to its list of sentences
  '''
  dic_docs = {}
  for doc in List_txt:
    # os.path.join works whether or not path_text ends with a separator.
    root = ET.parse(os.path.join(path_text, doc)).getroot()
    dic_docs[doc] = [
        ''.join(w.text for w in s if w.text is not None)
        for s in root
    ]
  return dic_docs

In [0]:
# Remove punctuation in sentences
def clean_sentences(dic_docs):
  '''
  Strip punctuation from every sentence of every document.

  Straight (') and typographic (’) apostrophes are replaced by a space;
  every remaining character that is not whitespace, a word character or a
  hyphen is then deleted.

  The dictionary is updated in place and also returned.

  :param dic_docs: dictionary that contains the list of sentences for each doc
  :return: dic_docs : the same dictionary, now holding the "clean sentences"
  '''
  # Raw string: "\s" and "\w" are invalid escape sequences in a plain literal.
  removed_chars = re.compile(r"([^\s\w\-])")
  for key in dic_docs.keys():
    dic_docs[key] = [
        removed_chars.sub('', sentence.replace("'", ' ').replace("’", ' '))
        for sentence in dic_docs[key]
    ]
  return dic_docs

In [0]:
# Read excel lexique file 
def read_excel(file):
  '''
  Read the lexicon Excel file and return it as a pandas DataFrame.

  Only the columns 'ortho', 'lemme', 'cgram' and 'freqlemfilms2' are kept.
  A column 'freqlemfilms2_norm' is added: 'freqlemfilms2' divided by its
  maximum (missing values are ignored when taking the maximum).

  :param file :  string : path of the excel file to read
  :return: df_lex : DataFrame : the four selected columns plus
  'freqlemfilms2_norm'
  '''
  # TODO: rename into get_lexique(excel_file)? (open question kept from the
  # original code)
  # The context manager closes the workbook handle once the sheet is parsed.
  with pd.ExcelFile(file) as workbook:
    # 'Sheet1' is the sheet that was always read; fall back to the first
    # sheet when the workbook does not have one with that name.
    if 'Sheet1' in workbook.sheet_names:
      sheet = 'Sheet1'
    else:
      sheet = workbook.sheet_names[0]
    df_all = workbook.parse(sheet)
  # .copy() yields an independent frame, so adding a column below does not
  # write into a slice of df_all (SettingWithCopyWarning).
  df_lex = df_all[['ortho', 'lemme', 'cgram', 'freqlemfilms2']].copy()
  serie = df_lex['freqlemfilms2']
  # Series.max() skips NaN, unlike the builtin max() whose result depends on
  # where the NaN sits in the column.
  df_lex['freqlemfilms2_norm'] = serie / serie.max()
  return df_lex

In [0]:
# Remove spaces and empty words, and lowercase everything
def get_clean_words(dic_docs):
  '''
  Flatten the sentences of each document into one list of lowercase words.

  Sentences are split on whitespace, so the resulting words contain no
  spaces and are never empty. The dictionary is modified in place.

  :param dic_docs : dictionary : contains the sentences of each document
  :return: dic_docs : dictionary : the same object, where each value has
  been replaced by the list of lowercased words of the document
  '''
  for name, sentences in dic_docs.items():
    dic_docs[name] = [
        token.lower()
        for sentence in sentences
        for token in sentence.split()
    ]
  return dic_docs

In [0]:
# Get complexity score of one text document
def get_complexity_doc(doc, df_lex, dic_docs, threshold_complex = 0.0001):
  '''
  Get text complexity score of one document : the score is based on
  words frequencies in french. We assume that if a word is not often
  used, this word is a complex word.

  The score is the share of the document's distinct words that are complex.
  A word found in the lexicon is complex when its highest normalized
  frequency is below the threshold. A word absent from the lexicon is
  complex when it has three characters or more, and not complex otherwise.

  :param doc :  string : key of the document in dic_docs
  :param df_lex :  pandas.DataFrame : contains the french lexique; the
  columns 'ortho' and 'freqlemfilms2_norm' are used
  :param dic_docs :  dictionary : list of words of each document
  :param threshold_complex : float : normalized frequency under which a
  word is considered complex
  :return: float : number of complex distinct words divided by the number
  of distinct words; 0.0 when the document has no word
  '''
  vocabulary = set(dic_docs[doc])
  if not vocabulary:
    # No word at all: nothing can be complex (and avoids dividing by zero).
    return 0.0

  # One pass over the lexicon for the whole vocabulary instead of one full
  # scan per word: keep the matching rows, then the best frequency per word.
  known_rows = df_lex[df_lex['ortho'].isin(vocabulary)]
  best_freq = known_rows.groupby('ortho')['freqlemfilms2_norm'].max().to_dict()

  cplxty = 0
  for word in vocabulary:
    if word in best_freq:
      freq = best_freq[word]
    elif len(word) >= 3:
      # Unknown word that is long enough: treated as never used.
      freq = 0
    else:
      # Unknown very short word: treated as very frequent.
      freq = 1
    if freq < threshold_complex:
      cplxty = cplxty + 1

  return cplxty / len(vocabulary)

In [0]:
# Get complexity scores of a list of text documents
def get_all_cplx(List_txt, df_lex, dic_docs):
  '''
  Compute the complexity score of every listed document.

  A "<position> / <total>" progress line is printed before each document
  is scored.

  :param List_txt : list : names of the documents to score
  :param df_lex : forwarded unchanged to get_complexity_doc
  :param dic_docs : forwarded unchanged to get_complexity_doc
  :return: dictionary : document name -> value returned by
  get_complexity_doc for that document
  '''
  total = len(List_txt)
  scores = {}
  for position, name in enumerate(List_txt, start=1):
    print('{} / {}'.format(position, total))
    scores[name] = get_complexity_doc(name, df_lex, dic_docs)
  return scores

In [0]:

def normalize_results(dic_cplx, ma):
  '''
  Rescale every score so that a score equal to ``ma`` becomes 100.

  :param dic_cplx : dictionary : score of each document
  :param ma : number : reference value the scores are divided by
  :return: dictionary : new dictionary holding, for each document, its
  score multiplied by 100 and divided by ``ma``; dic_cplx is not modified
  '''
  return {doc: value * 100 / ma for doc, value in dic_cplx.items()}

In [0]:
def list_max_docid(medium, only_commented):
  '''
  Map each extract of a medium to the maximum of its annotation values.

  :param medium : forwarded to get_medium
  :param only_commented : forwarded to get_medium
  :return: dictionary : {file name: maximum value}, where the file name is
  the first of the last seven columns without its last six characters, with
  ".xml" appended, and the maximum is taken over the other six columns
  '''
  # Get the annotated extracts for the medium.
  # NOTE(review): assumes get_medium returns a DataFrame whose last seven
  # columns are the document code followed by six annotation columns - confirm.
  extracts = get_medium(medium, only_commented)
  rows = extracts[extracts.columns[-7:]].values
  # A dictionary (not a list of couples): a repeated file name keeps only the
  # value of its last row.
  return {(row[0][:-6] + ".xml"): row[1:].max() for row in rows}