In [9]:
# NNTP library
from nntplib import NNTP
import numpy as np

def find_unique_subjects_and_authors(items_original):
    """Number the subjects of a group and the authors within each subject.

    Parameters
    ----------
    items_original : iterable of 8-tuples
        Overview records unpacked as ``(message_number, subject, author,
        date, message_id, references, size, lines)``. ``message_number``
        may be an int or a numeric string.

    Returns
    -------
    items_array_new : list of (message_number, subject_number, author_number)
        One entry per input record, ordered by subject, then message number.
        ``subject_number`` and ``author_number`` are 1-based; author numbers
        restart at 1 for every subject and follow order of first appearance.
    subject_number : int
        Total number of distinct subjects.
    author_numbers : list of int
        ``author_numbers[k - 1]`` is the number of distinct authors who
        posted under subject ``k``.

    Notes
    -----
    Subjects and authors are compared at full length. (Sorting through a
    fixed-width ``'S40'`` numpy field would merge any two subjects or authors
    sharing their first 40 bytes.)
    """
    # Keep only the relevant details, laid out as a sort key:
    # subject first, then message number, then author.
    items_reformatted = []
    for message_number, subject, author, date, message_id, references, size, lines in items_original:
        # Remove "Re: " in subject line so replies group with the original post
        if subject[0:4] == "Re: ":
            subject = subject[4:]
        items_reformatted.append((subject, int(message_number), author))

    # Sort by subject; ties are broken by message number, then author
    items_reformatted.sort()

    # Iterate through subjects to find unique subjects and authors
    subject_number = 0
    subject_current = None
    author_ids = {}          # author -> 1-based number within the current subject
    author_numbers = []
    items_array_new = []
    for subject, message_number, author in items_reformatted:
        # "subject_number == 0" forces the very first record to open a
        # subject, even when its subject line is an empty string.
        if subject_number == 0 or subject != subject_current:
            subject_current = subject
            subject_number += 1
            author_ids = {}
            author_numbers.append(0)
        if author not in author_ids:
            author_numbers[-1] += 1
            author_ids[author] = author_numbers[-1]
        items_array_new.append((message_number, subject_number, author_ids[author]))

    # Return list of unique subjects and authors
    return items_array_new, subject_number, author_numbers

def combine_conversations(items, number_of_subjects):
    
    # Create blank array
    conversations = []
    for i in range(number_of_subjects):
        conversations.append('<s>')
    
    # Parse the message
    for message_number, subject_number, author_number in items:

        print "--- Message [" + str(message_number) + "] Subject [" + str(subject_number) + "] Author [" + str(author_number) + "]"
        resp, number, id, text = s.body(str(message_number))
        
        # header = True
        finish = False
        output = ""

        # Parse text of post
        for line in text:
            # Parse pre-signature limiter
            if line[0:2] == "--":
                finish = True
            # Add line if appropriate
            if finish == False and line[0:1] != '>' and 'wrote:' not in line and 'a écrit :' not in line and 'a �crit :' not in line and len(line) > 0:
                # print line
                if len(output) == 0:
                    output = line
                else:
                    if output[-1:] == ' ':
                        output = output + line
                    else:
                        output = output + " " + line

        #print subject_number, author_number, output[0:39]
        conversations[subject_number - 1] += '<utt uid="' + str(author_number) + '">' + output + '</utt>'
        #print conversations[subject_number - 1]
        #print "OUTPUT " + output

    # Insert end-limiter for conversations
    for i in range(number_of_subjects):
        conversations[i] += '</s>'

    return conversations

######################################

# Open news server and read group list
s = NNTP('freenews.netfront.net', readermode=True)
groups = s.list()
print "There are " + str(len(groups[1])) + " groups in total"

# Create empty list for French groups
french_groups = []

# Find French groups
for group in groups[1]:
    currgroup = group[0]
    if currgroup[0:2] == 'fr':
        french_groups.append(group[0])
print "There are " + str(len(french_groups)) + " French groups"

# Open file to write
f = open('samplegroup_fra.xml', 'w')

# Iterate through French groups (only subset at present)
for group in french_groups[3:7]:
    
    # Read number of messages in group
    resp, count, first, last, name = s.group(group)
    print "- Group " + group + " has " + str(count) + " articles ranging from " + str(first) + " to " + str(last)
    
    # Read items info from group
    resp, items = s.xover(first, last)
    
    # Check unique subjects and authors
    items_array_new, subject_number, author_numbers = find_unique_subjects_and_authors(items)
    print "-- There are " + str(subject_number) + " subjects in this forum"
    
    # Combine conversations
    conversations = combine_conversations(items_array_new, subject_number)
    
    # Write conversations with 2 or more authors
    index = 0
    for dialog in conversations:
        if author_numbers[index] >= 2:
            f.write('<dialog>\n')
            f.write(dialog + '\n')
            f.write('</dialog>\n')
            print "---- Writing conversation on subject " + str(index + 1)
        index += 1
    
# Close file
f.close()

There are 47261 groups in total
There are 4004 French groups
- Group fr.bio.medecine.veterinaire has 7 articles ranging from 49 to 86
-- There are 5 subjects in this forum
--- Message [70] Subject [1] Author [1]
--- Message [68] Subject [2] Author [1]
--- Message [65] Subject [3] Author [1]
--- Message [49] Subject [4] Author [1]
--- Message [50] Subject [4] Author [1]
--- Message [86] Subject [5] Author [1]
- Group fr.bio.pharmacie has 9 articles ranging from 60 to 110
-- There are 8 subjects in this forum
--- Message [99] Subject [1] Author [1]
--- Message [110] Subject [2] Author [1]
--- Message [101] Subject [3] Author [1]
--- Message [60] Subject [4] Author [1]
--- Message [69] Subject [5] Author [1]
--- Message [109] Subject [6] Author [1]
--- Message [88] Subject [7] Author [1]
--- Message [89] Subject [7] Author [2]
--- Message [100] Subject [8] Author [1]
---- Writing conversation on subject 6
- Group fr.comp.algorithmes has 2 articles ranging from 79 to 93
-- There are 2 subj

In [184]:
for i in range(4):
    print i, conversations[i]

0 <s><utt uid="1">...........................................................</utt></s>
1 <s><utt uid="1">Le 21/01/2016 09:16, Pierre-Alain Dorange a �crit : Oui... L'avantage de ma fa�on de faire actuelle avec OpenOffice c'est que je ne me pose pas vraiment de question sur la p�rennit� du format, et qu'en plus je peux facilement y acc�der depuis un smartphone (les docs sur sur dropbox, et on trouve facilement des readers OpenDocument). J'esp�rais qu'il existait un standard ouvert de m�me genre pour des notes/fiches organis�es, mais �a n'a pas trop l'air d'�tre le cas. Le fait de stocker les notes en XML ne change rien si la structure du XML est sp�cifique � l'appli utilis�e.</utt><utt uid="1">Evernote ou OneNote, c'est un peu le même combat :-). En fait psychologiquement j'ai encore du mal à confier des documents "sérieux" à un service purement cloud-based. La seule exception que je fais c'est mon carnet de contacts gmail, mais je le backupe régulièrement en CSV ! Le reste (sur Google

In [216]:
print s.article('215')
print s.article('229')

('220 215 <dgfo5mF8p9sU1@mid.individual.net> article', '215', '<dgfo5mF8p9sU1@mid.individual.net>', ['Path: news.netfront.net!news.glorb.com!fu-berlin.de!uni-berlin.de!individual.net!not-for-mail', 'From: pehache <pehache.7@gmail.com>', 'Newsgroups: fr.comp.applications.bureautique,fr.comp.os.mac-os.x', 'Subject: Re: Logiciel de notes avec format ouvert et multiplateforme', 'Date: Fri, 22 Jan 2016 23:18:30 +0100', 'Lines: 33', 'Message-ID: <dgfo5mF8p9sU1@mid.individual.net>', 'References: <dg17a2FjavoU1@mid.individual.net>', ' <1mh6h8x.105z04r1l5d86eN%josephb@nowhere.invalid>', ' <dg24uoFr0t6U2@mid.individual.net>', ' <1mh6p1l.8gglc318q1kxiN%jeff@-noreply-lecanet.com>', ' <1mh6sj2.144ns8rs2npaN%eric.hamery@metamaitre.com>', ' <1mh8upj.wxg2x11oi7x5bN%jeff@-noreply-lecanet.com>', ' <1mhdb05.dga6n11c1326vN%pdorange@pas-de-pub-merci.mac.com>', 'Mime-Version: 1.0', 'Content-Type: text/plain; charset=windows-1252; format=flowed', 'Content-Transfer-Encoding: 8bit', 'X-Trace: individual.net cG

In [213]:
resp, number, id, text = s.body('215')
resp, number, id, text2 = s.body('229')
print text

a = '\xc3\xa9crit'
print type(a)
print a

['Le 21/01/2016 09:16, Pierre-Alain Dorange a \xe9crit :', '> Jean-Francois Gautier <jeff@-noreply-lecanet.com> wrote:', '>', ">> J'avais jamais fait attention au fait qu'Evernote avec un format XML", ">> propr\xe9taire ! Avec un peu d'astuce, on doit m\xeame \xeatre capable de", ">> r\xe9cup\xe9rer son DTD et l'exploiter pour l'importer dans des logiciels de", '>> gestion documentaires plus aboutis', '>', '> Ca restera (si tu arrives a faire du reverse-ingeneering) propri\xe9taire', '> et tu es \xe0 la merci de mises \xe0 jour qui changent tout et rende ton', "> \xe9ventuelle moulinette obsol\xe8te... C'est tout le probl\xe8me du", '> propri\xe9taire.', '>', '> La meilleure solution resterait un petit logiciel en surcouche de', '> Open/LibreOffice. LibreOffice comme logiciel avec format ouvert,', '> standardis\xe9 et r\xe9utilisable, g\xe9rant tout aussi bien du texte, des', '> tableurs et du dessin.', "> Et une surcouche (qui n'existe pas je crois) pour regrouper les", '> documents a

In [215]:
for line in text:
    print line
    print line.decode('latin1')
    
for line in text2:
    print line
    print line.decode('latin1')
    

Le 21/01/2016 09:16, Pierre-Alain Dorange a �crit :
Le 21/01/2016 09:16, Pierre-Alain Dorange a écrit :
> Jean-Francois Gautier <jeff@-noreply-lecanet.com> wrote:
> Jean-Francois Gautier <jeff@-noreply-lecanet.com> wrote:
>
>
>> J'avais jamais fait attention au fait qu'Evernote avec un format XML
>> J'avais jamais fait attention au fait qu'Evernote avec un format XML
>> propr�taire ! Avec un peu d'astuce, on doit m�me �tre capable de
>> proprétaire ! Avec un peu d'astuce, on doit même être capable de
>> r�cup�rer son DTD et l'exploiter pour l'importer dans des logiciels de
>> récupérer son DTD et l'exploiter pour l'importer dans des logiciels de
>> gestion documentaires plus aboutis
>> gestion documentaires plus aboutis
>
>
> Ca restera (si tu arrives a faire du reverse-ingeneering) propri�taire
> Ca restera (si tu arrives a faire du reverse-ingeneering) propriétaire
> et tu es � la merci de mises � jour qui changent tout et rende ton
> et tu es à la merci de mises à jour qui changent 

In [191]:
print '\xe9'.decode('latin1').encode('utf8')

é


In [192]:
print 'la p\xe9rennit\xe9'.decode('latin1').encode('utf8')

la pérennité


In [228]:
import chardet
encoding1 = chardet.detect('r\xe9cup\xe9rer son DTD et l'exploiter pour')
encoding1['encoding']
encoding2 = chardet.detect('facilement cr\xc3\xa9er des liens vers de documents plus fournis')
encoding2['encoding']

'ISO-8859-1'

In [232]:
print chardet.detect(text[0])
print chardet.detect(text2[0])

{'confidence': 0.73, 'language': '', 'encoding': 'ISO-8859-1'}
{'confidence': 0.73, 'language': '', 'encoding': 'ISO-8859-1'}
