# Consigne

Vous devrez effectuer les opérations de traitement suivantes sur le texte, pas forcément dans cet ordre

- Créer des paires de document (article, highlights)
- Suppression de la ponctuation
- Séparation en token en minuscules
- Suppression des stopwords pour les articles
- Calcul des fréquences et tf-idf sur les deux types de documents
- Enregistrement du nouveau jeu de données d’entraînement pour usage ultérieur

In [1]:
from nltk import corpus, tokenize
corpus_root = './cnn/stories' #il faut déziper le tgz et placer le repo cnn au même niveau que le notebook
wordlists = corpus.PlaintextCorpusReader(corpus_root, '.*')

In [2]:
len(wordlists.fileids())

92579

In [3]:
wordlists.fileids()[:5]

['0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 '0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 '00027e965c8264c35cc1bc55556db388da82b07f.story',
 '0002c17436637c4fe1837c935c04de47adb18e9a.story',
 '0003ad6ef0c37534f80b55b4235108024b407f0b.story']

### Fonction de séparation en token en minuscules + suppression des stopwords/éléments de ponctuation + lemmatization

In [4]:
import string
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

stopwords = set(stopwords.words('english'))|{"'s","'d","'ll","'m","'re","'ve","say"}
punctuation = set(word_tokenize(string.punctuation))|{".","`","``","-","--","..."}
punctuation.remove("@")
#tokenizer = nltk.RegexpTokenizer('\w+')

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

def preprocess(string):
    tmp = [WordNetLemmatizer().lemmatize(word, pos=penn2morphy(tag)) for word, tag in pos_tag(word_tokenize(string))]
    tmp = [str.lower(word) for word in tmp] 
    res = [word for word in tmp if word not in stopwords|punctuation]
    return res

### Création des paires de document (article, highlights) et calculs des term frequencies

In [5]:
# #décompte du nombre maximum d'highlights dans un article
# import re
# max_highlight = 0
# for fileid in wordlists.fileids():
#     highlight_freq = len(re.findall('(@highlight)',wordlists.raw(fileid)))
#     if highlight_freq > max_highlight:
#         max_highlight = highlight_freq

# print("At most there are {} highlights in an article".format(max_highlight)) 
# #on trouve max_highlight égal à 5

In [6]:
#on travaille sur un subset du corpus total pour limiter les temps de calcul
num_articles_to_work_on = 300

#création du dataframe
import pandas as pd
columns=["article_tf","highlight1_tf","highlight2_tf","highlight3_tf","highlight4_tf","highlight5_tf","article_idf", "article_tf-idf"]
df = pd.DataFrame(index=wordlists.fileids()[:num_articles_to_work_on], columns=columns)

In [7]:
#remplissage du dataframe
from collections import Counter

for fileid in df.index:
    strings = wordlists.raw(fileid).split("@highlight")
    data = [Counter(preprocess(string)).most_common() for string in strings]
    df.loc[fileid].iloc[:len(data)] = data

df.head(5)

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf,article_idf,article_tf-idf
0001d1afc246a7964130f43ae940af6bc6c57f01.story,"[(obama, 28), (syria, 25), (military, 17), (sy...","[(syrian, 1), (official, 1), (obama, 1), (clim...","[(obama, 1), (send, 1), (letter, 1), (head, 1)...","[(obama, 1), (seek, 1), (congressional, 1), (a...","[(aim, 1), (determine, 1), (whether, 1), (cw, ...",,,
0002095e55fcbd3a2f366d9bf92a95433dc305ef.story,"[(bolt, 8), (gold, 7), (second, 7), (champions...","[(usain, 1), (bolt, 1), (win, 1), (third, 1), ...","[(anchors, 1), (jamaica, 1), (4x100m, 1), (rel...","[(eighth, 1), (gold, 1), (championship, 1), (b...","[(jamaica, 1), (double, 1), (woman, 1), (4x100...",,,
00027e965c8264c35cc1bc55556db388da82b07f.story,"[(gsa, 26), (employee, 16), (kansas, 9), (city...","[(employee, 1), (agency, 1), (kansas, 1), (cit...","[(employee, 1), (travel, 1), (mainland, 1), (u...","[(telecommuting, 1), (program, 1), (like, 1), ...",,,,
0002c17436637c4fe1837c935c04de47adb18e9a.story,"[(burkhart, 20), (fire, 18), (los, 10), (angel...","[(new, 1), (canadian, 1), (doctor, 1), (part, ...","[(new, 1), (diagnosis, 1), (autism, 1), (sever...","[(burkhart, 1), (also, 1), (suspect, 1), (germ...","[(prosecutors, 1), (believe, 1), (german, 1), ...",,,
0003ad6ef0c37534f80b55b4235108024b407f0b.story,"[(rape, 7), (police, 4), (arrest, 4), (thursda...","[(another, 1), (arrest, 1), (make, 1), (gang, ...","[(investigators, 1), (20, 1), (people, 1), (ta...","[(four, 1), (suspect, 1), (appear, 1), (court,...",,,,


### Calcul des inverse document frequencies et des tf-idf

In [8]:
flat_list = [item[0] for sublist in df["article_tf"].tolist() for item in sublist]
flat_list_sorted = sorted(flat_list)

In [9]:
import math

N = len(df.index)
for index in df.index:
    article_tf = df.loc[index,"article_tf"]
    if article_tf:
        df_list = [(word,flat_list_sorted.count(word)) for word in list(zip(*article_tf))[0]]  
        df.loc[index,"article_idf"] = [(item[0],math.log10(N/item[1])) for item in df_list]
        df.loc[index,"article_tf-idf"] = [(item[0][0],item[0][1]*item[1][1]) \
                                          for item in list(zip(article_tf,df.loc[index,"article_idf"]))]
    else:
        continue

In [10]:
df.head()

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf,article_idf,article_tf-idf
0001d1afc246a7964130f43ae940af6bc6c57f01.story,"[(obama, 28), (syria, 25), (military, 17), (sy...","[(syrian, 1), (official, 1), (obama, 1), (clim...","[(obama, 1), (send, 1), (letter, 1), (head, 1)...","[(obama, 1), (seek, 1), (congressional, 1), (a...","[(aim, 1), (determine, 1), (whether, 1), (cw, ...",,"[(obama, 0.8143634230380883), (syria, 1.435728...","[(obama, 22.802175845066472), (syria, 35.89321..."
0002095e55fcbd3a2f366d9bf92a95433dc305ef.story,"[(bolt, 8), (gold, 7), (second, 7), (champions...","[(usain, 1), (bolt, 1), (win, 1), (third, 1), ...","[(anchors, 1), (jamaica, 1), (4x100m, 1), (rel...","[(eighth, 1), (gold, 1), (championship, 1), (b...","[(jamaica, 1), (double, 1), (woman, 1), (4x100...",,"[(bolt, 2.0), (gold, 1.6320232147054057), (sec...","[(bolt, 16.0), (gold, 11.42416250293784), (sec..."
00027e965c8264c35cc1bc55556db388da82b07f.story,"[(gsa, 26), (employee, 16), (kansas, 9), (city...","[(employee, 1), (agency, 1), (kansas, 1), (cit...","[(employee, 1), (travel, 1), (mainland, 1), (u...","[(telecommuting, 1), (program, 1), (like, 1), ...",,,"[(gsa, 2.4771212547196626), (employee, 1.33099...","[(gsa, 64.40515262271123), (employee, 21.29589..."
0002c17436637c4fe1837c935c04de47adb18e9a.story,"[(burkhart, 20), (fire, 18), (los, 10), (angel...","[(new, 1), (canadian, 1), (doctor, 1), (part, ...","[(new, 1), (diagnosis, 1), (autism, 1), (sever...","[(burkhart, 1), (also, 1), (suspect, 1), (germ...","[(prosecutors, 1), (believe, 1), (german, 1), ...",,"[(burkhart, 2.4771212547196626), (fire, 0.8973...","[(burkhart, 49.54242509439325), (fire, 16.1520..."
0003ad6ef0c37534f80b55b4235108024b407f0b.story,"[(rape, 7), (police, 4), (arrest, 4), (thursda...","[(another, 1), (arrest, 1), (make, 1), (gang, ...","[(investigators, 1), (20, 1), (people, 1), (ta...","[(four, 1), (suspect, 1), (appear, 1), (court,...",,,"[(rape, 1.7781512503836436), (police, 0.736758...","[(rape, 12.447058752685505), (police, 2.947034..."


In [11]:
df.describe()

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf,article_idf,article_tf-idf
count,300,300,300,285,175,2,299,299
unique,300,300,300,285,175,2,299,299
top,"[(oil, 12), (bp, 11), (would, 8), (suttles, 8)...","[(louisiana, 1), (mostly, 1), (spar, 1), (sinc...","[(bp, 1), (hit, 1), (3,400, 1), (claim, 1), (l...","[(looking, 1), (forward, 1), (outright, 1), (i...","[(roommate, 1), (tyler, 1), (clementi, 1), (ki...","[(catholic, 1), (relief, 1), (services, 1), (d...","[(uganda, 2.0), (agency, 0.8143634230380883), ...","[(uganda, 10.0), (agency, 3.257453692152353), ..."
freq,1,1,1,1,1,1,1,1


In [12]:
#E.g. classement par tf-idf des termes de l'article 0001d1afc246a7964130f43ae940af6bc6c57f01.story
sorted(df.loc["0001d1afc246a7964130f43ae940af6bc6c57f01.story","article_tf-idf"], key=lambda x:x[1],reverse=True)

[('syria', 35.893214239035935),
 ('syrian', 23.785580060704262),
 ('obama', 22.802175845066472),
 ('chemical', 17.952255361759462),
 ('weapon', 15.884033745012633),
 ('military', 13.529960294849278),
 ('u.n.', 11.4858285564915),
 ('congress', 10.212287165077106),
 ('saturday', 9.392591607539382),
 ('action', 9.278012395037427),
 ('lawmaker', 8.862727528317974),
 ('cameron', 8.704365036222725),
 ('inspector', 8.614371417368625),
 ('authorization', 7.431363764158988),
 ('debate', 7.190205922601001),
 ('u.s.', 6.274544943364051),
 ('speech', 5.09200508825495),
 ('nesirky', 4.954242509439325),
 ('menace', 4.954242509439325),
 ('unfazed', 4.954242509439325),
 ('use', 4.951959185912927),
 ('attack', 4.7931069565498605),
 ('crisis', 4.619607839942973),
 ('president', 4.535940610079674),
 ('vote', 4.387640052032226),
 ('damascus', 4.352182518111363),
 ('assad', 4.352182518111363),
 ('russia', 4.307185708684313),
 ('british', 4.248591626995378),
 ('loom', 4.0),
 ('ally', 3.8190038161912128),
 (

### Enregistrement du nouveau jeu de données d’entraînement pour usage ultérieur

In [13]:
#On choisit de sauvegarder le dataframe en pickle
df.to_pickle("cnn_df.pkl") 

##La prochaine fois on pourra reloader ce dataframe avec:
#df = pd.read_pickle("cnn_df.pkl")