# Consigne

Vous devrez effectuer les opérations de traitement suivantes sur le texte, pas forcément dans cet ordre

- Créer des paires de document (article, highlights)
- Suppression de la ponctuation
- Séparation en token en minuscules
- Suppression des stopwords pour les articles
- Calcul des fréquences et tf-idf sur les deux types de documents
- Enregistrement du nouveau jeu de données d’entraînement pour usage ultérieur

In [1]:
from nltk import corpus, tokenize
corpus_root = './cnn/stories' 

In [2]:
wordlists = corpus.PlaintextCorpusReader(corpus_root, '.*')

len(wordlists.fileids())

92579

In [3]:
wordlists.fileids()[:5]

['0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 '0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 '00027e965c8264c35cc1bc55556db388da82b07f.story',
 '0002c17436637c4fe1837c935c04de47adb18e9a.story',
 '0003ad6ef0c37534f80b55b4235108024b407f0b.story']

In [4]:
wordlists.raw('0001d1afc246a7964130f43ae940af6bc6c57f01.story')

'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.\n\nObama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.\n\nThe proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."\n\nIt\'s a step that is set to turn an international crisis into a fierce domestic political battle.\n\nThere are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react?\n\nIn a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but 

### Fonction de séparation en token en minuscules + suppression des stopwords/éléments de ponctuation + lemmatization

In [5]:
import string
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

stopwords = set(stopwords.words('english'))|{"'s"}
punctuation = set(word_tokenize(string.punctuation))|{".","`","``","-","--"}
punctuation.remove("@")
#tokenizer = nltk.RegexpTokenizer('\w+')

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

def preprocess(string):
    tmp = [WordNetLemmatizer().lemmatize(word, pos=penn2morphy(tag)) for word, tag in pos_tag(word_tokenize(string))]
    tmp = [str.lower(word) for word in tmp] 
    res = [word for word in tmp if word not in stopwords|punctuation]
    return res

### Création des paires de document (article, highlights) et calculs des term frequencies

In [6]:
# #décompte du nombre maximum d'highlights dans un article
# import re
# max_highlight = 0
# for fileid in wordlists.fileids():
#     highlight_freq = len(re.findall('(@highlight)',wordlists.raw(fileid)))
#     if highlight_freq > max_highlight:
#         max_highlight = highlight_freq

# print("At most there are {} highlights in an article".format(max_highlight)) 
# #on trouve max_highlight égal à 5

In [16]:
#on travaille sur un subset du corpus total pour limiter les temps de calcul
num_articles_to_work_on = 100 

#création du dataframe
import pandas as pd
columns=["article_tf","highlight1_tf","highlight2_tf","highlight3_tf","highlight4_tf","highlight5_tf"]
df = pd.DataFrame(index=wordlists.fileids()[:num_articles_to_work_on], columns=columns)

df.head()

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf
0001d1afc246a7964130f43ae940af6bc6c57f01.story,,,,,,
0002095e55fcbd3a2f366d9bf92a95433dc305ef.story,,,,,,
00027e965c8264c35cc1bc55556db388da82b07f.story,,,,,,
0002c17436637c4fe1837c935c04de47adb18e9a.story,,,,,,
0003ad6ef0c37534f80b55b4235108024b407f0b.story,,,,,,


In [17]:
#remplissage du dataframe

from collections import Counter

for fileid in df.index:
    strings = wordlists.raw(fileid).split("@highlight")
    data = [Counter(preprocess(string)).most_common() for string in strings]
    df.loc[fileid].iloc[:len(data)] = data

df.head()

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf
0001d1afc246a7964130f43ae940af6bc6c57f01.story,"[(obama, 28), (say, 28), (syria, 25), (militar...","[(syrian, 1), (official, 1), (obama, 1), (clim...","[(obama, 1), (send, 1), (letter, 1), (head, 1)...","[(obama, 1), (seek, 1), (congressional, 1), (a...","[(aim, 1), (determine, 1), (whether, 1), (cw, ...",
0002095e55fcbd3a2f366d9bf92a95433dc305ef.story,"[(bolt, 8), (gold, 7), (second, 7), (champions...","[(usain, 1), (bolt, 1), (win, 1), (third, 1), ...","[(anchors, 1), (jamaica, 1), (4x100m, 1), (rel...","[(eighth, 1), (gold, 1), (championship, 1), (b...","[(jamaica, 1), (double, 1), (woman, 1), (4x100...",
00027e965c8264c35cc1bc55556db388da82b07f.story,"[(gsa, 26), (employee, 16), (kansas, 9), (city...","[(employee, 1), (agency, 1), (kansas, 1), (cit...","[(employee, 1), (travel, 1), (mainland, 1), (u...","[(telecommuting, 1), (program, 1), (like, 1), ...",,
0002c17436637c4fe1837c935c04de47adb18e9a.story,"[(burkhart, 20), (fire, 18), (say, 17), (los, ...","[(new, 1), (canadian, 1), (doctor, 1), (say, 1...","[(new, 1), (diagnosis, 1), (autism, 1), (sever...","[(burkhart, 1), (also, 1), (suspect, 1), (germ...","[(prosecutors, 1), (believe, 1), (german, 1), ...",
0003ad6ef0c37534f80b55b4235108024b407f0b.story,"[(rape, 7), (police, 4), (arrest, 4), (thursda...","[(another, 1), (arrest, 1), (make, 1), (gang, ...","[(investigators, 1), (say, 1), (20, 1), (peopl...","[(four, 1), (suspect, 1), (appear, 1), (court,...",,


In [18]:
df.describe()

Unnamed: 0,article_tf,highlight1_tf,highlight2_tf,highlight3_tf,highlight4_tf,highlight5_tf
count,100,100,100,94,58,0.0
unique,100,100,100,94,58,0.0
top,"[(mubarak, 10), (charge, 8), (trial, 5), (year...","[(new, 1), (retrial, 1), (former, 1), (preside...","[(retrial, 1), (hold, 1), (mubarak, 1), (son, ...","[(vigil, 1), (hold, 1), (recent, 1), (effort, ...","[(steal, 1), (treasure, 1), (often, 1), (hard,...",
freq,1,1,1,1,1,


### Calcul des inverse document frequencies

In [None]:
#enlever les mots les plus présents partout

In [19]:
import math
import numpy as np
df["article_idf","highlight1_idf","highlight2_idf","highlight3_idf","highlight4_idf","highlight5_idf"] = np.NaN


def idf(tf, df):
    
    idf_list = [idf for i]
    return math.log10(len(df_tf.index)/len([x for x in occurence if  ]))

SyntaxError: invalid syntax (<ipython-input-19-4c276794fbe6>, line 6)

In [12]:
from nltk import FreqDist

FreqDist()


# Récupération des comptages
freq, stats, corpora = freq_stats_corpora()
df = pd.DataFrame.from_dict(stats, orient='index')

# Affichage des fréquences
df.sort(columns='total', ascending=False)
df.plot(kind='bar', color="#f56900", title='Top 50 Rappeurs par nombre de mots')

NameError: name 'freq_stats_corpora' is not defined

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
tfidf = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
 

NameError: name 'stop_words' is not defined

In [None]:
# Fit the TfIdf model
tfidf.fit([reuters.raw(file_id) for file_id in reuters.fileids()])
 
# Transform a document into TfIdf coordinates
X = tfidf.transform([reuters.raw('test/14829')])
 
 
# Check out some frequencies
print X[0, tfidf.vocabulary_['year']]                   # 0.0562524229373
print X[0, tfidf.vocabulary_['following']]              # 0.057140265658
print X[0, tfidf.vocabulary_['provided']]               # 0.0689364372666
print X[0, tfidf.vocabulary_['structural']]             # 0.0900802810906
print X[0, tfidf.vocabulary_['japanese']]               # 0.114492409303
print X[0, tfidf.vocabulary_['downtrend']]              # 0.111137191743

In [None]:
# Premièrement, on récupère la fréquence totale de chaque mot sur tout le corpus d'artistes
freq_totale = nltk.Counter()
for k, v in corpora.iteritems():
 freq_totale += freq[k]

# Deuxièmement on décide manière un peu arbitraire du nombre de mots les plus fréquents à supprimer. On pourrait afficher un graphe d'évolution du nombre de mots pour se rendre compte et avoir une meilleure heuristique. 
most_freq = zip(*freq2.most_common(100))[0]

# On créé notre set de stopwords final qui cumule ainsi les 100 mots les plus fréquents du corpus ainsi que l'ensemble de stopwords par défaut présent dans la librairie NLTK
sw = set()
sw.update(stopwords)
sw.update(tuple(nltk.corpus.stopwords.words('french')))

### Enregistrement du nouveau jeu de données d’entraînement pour usage ultérieur