In [15]:
# Import libraries
from nntplib import NNTP
import numpy as np
import os
import csv
######################################

def find_unique_subjects_and_authors(items_original):
    
    # Empty list
    items_reformatted = []

    # Create new list of tuples with only relevent details
    for message_number, subject, author, date, message_id, references, size, lines in items_original:
        # Remove "Re: " in subject line
        if subject[0:4] == "Re: ":
            subject = subject[4:]
        items_reformatted.append((message_number, subject, author))

    # Convert to numpy array
    dtype = [('message_number', int), ('subject', 'S40'), ('author', 'S40')]
    items_array = np.array(items_reformatted, dtype = dtype)

    # Sort by subject
    items_array_sort = np.sort(items_array, axis=0, order="subject")
        
    # Iterate through subjects to find unique subjects and authors
    subject_number = 0
    subject_current = ''
    authors = []
    author_numbers = []
    items_array_new = []
    for item in items_array_sort:
        if item[1] != subject_current:
            subject_current = item[1]
            subject_number += 1
            authors = []
            author_numbers.append(0)
        if item[2] not in authors:
            authors.append(item[2])
            author_numbers[-1] += 1
        items_array_new.append((int(item[0]), subject_number, authors.index(item[2])+1))
        
    # Return array of unique subjects and authors
    return items_array_new, subject_number, author_numbers

######################################

def combine_conversations(items_to_combine, number_of_subjects):
    
    # Create blank array
    conversations = []
    for i in range(number_of_subjects):
        conversations.append('<s>')
    
    # Parse the message
    for message_number, subject_number, author_number in items_to_combine:
        
        # Read header and body
        resp, number, id, text = s.article(str(message_number))
        
        # Determine encoding
        encoding = 'none'
        for line in text:
            if line[0:18] == 'Content-Type: text':
                index = line.find('charset')
                if index != -1:
                    encoding = line[index+8:]
                    end = min(encoding.find(';'), encoding.find(' '))
                    if end != -1:
                        encoding = encoding[0:end]
                    # print "DEBUG: line + " === " + encoding
        encoding2 = 'none'
        for line in text:
            if line[0:27] == 'Content-Transfer-Encoding: ':
                encoding2 = line[27:]
        
        # print "DEBUG: line + " === " + encoding
        print "-- Message [" + str(message_number) + "] Subject [" + str(subject_number) + "] Author [" + str(author_number) + "]" + " Encoding [" + encoding + "," + encoding2 + "]"
        
        # Setup variables to parse body
        header = True
        finish = False
        output = ""

        # Parse body of post
        # ***************** THIS PART NEEDS EXCEPTION CATCHING AS IF ENCODING DOES NOT MATCH MESSAGE IT CRASHES *************
        try:
            for line_raw in text:
                # Decode text into UTF-8
                if encoding == 'none':
                    line = line_raw.encode('utf-8')
                else:
                    line = line_raw.decode(encoding).encode('utf-8')
                # Parse pre-signature limiter
                if line[0:2] == "--":
                    finish = True    
                # Add line if appropriate
                if header == False and finish == False and line[0:1] != '>' and 'wrote:' not in line and 'a écrit :' not in line and len(line) > 0:
                    # print line
                    if len(output) == 0:
                        output = line
                    else:
                        if output[-1:] == ' ':
                            output = output + line
                        else:
                            output = output + " " + line                        
                # Find end of header
                if line == '':
                    header = False
        except UnicodeDecodeError:
            print " -- -- -- Unicode Decode Error caught -- -- --"
        except LookupError:
            print " -- -- -- Lookup Error caught -- -- --"
        print "DEBUG: " + output[0:50]

        #print subject_number, author_number, output[0:39]
        conversations[subject_number - 1] += '<utt uid="' + str(author_number) + '">' + output + '</utt>'
        #print conversations[subject_number - 1]
        #print "DEBUG: " + output[0:40]

    # Insert end-limiter for conversations
    for i in range(number_of_subjects):
        conversations[i] += '</s>'

    return conversations

######################################

# Open news server
#s = NNTP('freenews.netfront.net', readermode=True)
s = NNTP('blaine.gmane.org', readermode=True)
#s = NNTP('news2.informatik.uni-stuttgart.de', readermode=True) - does not return groups
print "Opened connection to news server"

# Read pre-existing group list if available
if os.path.isfile('groups.txt'):
    french_groups = []
    with open('groups.txt') as f:
        for line in f:
            french_groups.append(line[:-1])
    print "- read " + str(len(french_groups)) + " French groups from file"
# Otherwise create from server
else: 
    # Read group list
    groups = s.list()
    print "- there are " + str(len(groups[1])) + " groups in total"

    # Create empty list for French groups
    french_groups = []

    # Find French groups
    for group in groups[1]:
        currgroup = group[0]
        #if currgroup[0:2] == 'fr':
        if 'french' in currgroup:
            french_groups.append(group[0])
    print "- there are " + str(len(french_groups)) + " groups in French"

    # Save French groups
    with open('groups.txt', 'w') as f:
        for group in french_groups:
            f.write(group + "\n")
    print "- written to file"

# Open file to write
f = open('samplegroup_fra.xml', 'w')
g = open('stats_message.csv', 'wb')

group_writer = csv.writer(g)
group_writer.writerow(['Group', 'No. of messages', 'No. of authors']) 
# Iterate through French groups (only subset at present)

for group in french_groups[0:5]:
    # ********** THIS NEEDS TO BE CHANGED TO LOOK AT ALL GROUPS FOR FINAL VERSION ************
    
    # Read number of messages in group
    resp, count, first, last, name = s.group(group)
    print "Group [" + group + "] has " + count + " articles (" + first + ", " + last + ")"
    
    # Read items info from group
    print "- Reading items"
    
    # DEBUG - ******** THIS NEEDS TO BE REMOVED, IT JUST LOOKS AT LAST 50 MESSAGES TO SAVE TIME FOR NOW *********
    if int(last)-int(first) > 200:
        first = str(int(last)-200)
        print "-- DEBUG: Truncating to (" + first + "," + last + ")"
    # DEBUG
    
    resp, items = s.xover(first, last)
        
    # Find unique subjects and authors
    print "- Sorting items"
    items_unique, subject_number, author_numbers = find_unique_subjects_and_authors(items)
    print "-- There are " + str(subject_number) + " unique subjects in this forum"
    
    # ******* NEED TO WRITE THE GROUP, NUMBER OF MESSAGES etc. TO A FILE g FOR STATISTICS *******
    group_writer.writerow([group, subject_number, sum(author_numbers)])
    
    # Combine conversations
    print "- Combining conversations"
    conversations = combine_conversations(items_unique, subject_number)
    
    # creat h stats for each group
    hname="stats_author_"+group+".csv"
    h=open(hname,'wb')
    author_writer = csv.writer(h)
    author_writer.writerow(['Conversation no.', 'No. of authors']) 
        
    index = 0
    counter = 1
    for dialog in conversations:
        if author_numbers[index] >= 2:
            f.write('<dialog>\n')
            f.write(dialog + '\n')
            f.write('</dialog>\n')
            print "-- Writing conversation on subject " + str(index + 1)
            print "DEBUG: " + dialog
            author_writer.writerow([counter,author_numbers[index]])
            counter += 1
        index += 1
        
    # ****** Written for now conversation number and number of authors in that conversation (added in above code) *******
    
# Close file
f.close()
g.close()
h.close()

Opened connection to news server
- read 79 French groups from file
Group [gmane.linux.debian.user.french] has 204097 articles (1, 205169)
- Reading items
-- DEBUG: Truncating to (204969,205169)
- Sorting items
-- There are 75 unique subjects in this forum
- Combining conversations
-- Message [205141] Subject [1] Author [1] Encoding [none,8BIT]
 -- -- -- Unicode Decode Error caught -- -- --
DEBUG:   This message is in MIME format.  The first part 
-- Message [205142] Subject [2] Author [1] Encoding ["UTF-8",8bit]
DEBUG: Je ne reproche rien à JFS, sinon d'être populair
-- Message [205165] Subject [3] Author [1] Encoding ["UTF-8",8bit]
DEBUG: Bonjour Ca fait plaisir de voir des messages de su
-- Message [205057] Subject [4] Author [1] Encoding ["UTF-8",8bit]
DEBUG: Bonjour Si Network Manager est isntallé, in peut 
-- Message [205139] Subject [5] Author [1] Encoding ["UTF-8",8bit]
DEBUG: Bonjour Quelques questions: Les SSD prennent ils e
-- Message [205160] Subject [5] Author [1] Encoding 

-- Message [204972] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Un gars =C3=A9voque le fait qu'un liveCD a endomma
-- Message [204976] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour Revoici le lien (copi=C3=A9 dans la barre 
-- Message [204981] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour Concr=C3=A8tement, comment faut-il proc=C3
-- Message [205010] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: PS L'=C3=A9quipe Cryptsetup avait r=C3=A9pondu =C3
-- Message [205011] Subject [33] Author [2] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Pour la g=C3=A9n=C3=A9ration d'un bug : -
-- Message [205012] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Dans mon cas, la machine a =C3=A9t=C3=A9 
-- Message [205013] Subject [33] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Dans mon cas, la machine a =C3=A9t=C3=A9 
-- Message [205028] Subject [33] Author [1] Enco

-- Message [205058] Subject [58] Author [2] Encoding ["UTF-8",8bit]
DEBUG: Le jeudi 14 septembre 2017 à 10:33 +0200, David B
-- Message [205059] Subject [58] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour Jérôme, Pour répondre à ta première q
-- Message [205060] Subject [58] Author [3] Encoding [utf-8,8bit]
DEBUG: Bonsoir, Je suis plutôt interface graphique mais 
-- Message [205062] Subject [58] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour Yannick, Le 14/09/2017 à 17:19, Yannick a
-- Message [205065] Subject [58] Author [4] Encoding [UTF-8,quoted-printable]
DEBUG: Le Thu, 14 Sep 2017 17:42:16 +0200, David BERCOT <
-- Message [205069] Subject [58] Author [5] Encoding [utf-8,8bit]
DEBUG: Le 14/09/2017 à 10:33, David BERCOT a écrit : [
-- Message [205071] Subject [58] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Le 14/09/2017 à 18:02, Haricophile a éc
-- Message [205072] Subject [58] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Le 15/09/2017 à 09:23, didier gaumet a �
-- Messa

Group [gmane.linux.debian.devel.french] has 4332 articles (1, 4658)
- Reading items
-- DEBUG: Truncating to (4458,4658)
- Sorting items
-- There are 129 unique subjects in this forum
- Combining conversations
-- Message [4467] Subject [1] Author [1] Encoding ["UTF-8",quoted-printable]
DEBUG: Le mercredi 30 avril 2014 =C3=A0 15:03 +0200, Benj
-- Message [4466] Subject [2] Author [1] Encoding ["ISO-8859-1",quoted-printable]
DEBUG: Message en plusieurs parties au format MIME
-- Message [4515] Subject [3] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Bonjour Sur un certain nombre de serveurs, j'ai un
-- Message [4516] Subject [3] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Le 16/12/2014 10:57, Laurent COOPER a =E9crit : ro
-- Message [4543] Subject [4] Author [1] Encoding ["utf-8",7bit]
DEBUG: 
-- Message [4623] Subject [5] Author [1] Encoding [utf-8,base64]
DEBUG: 
-- Message [4558] Subject [6] Author [1] Encoding ["utf-8",quoted-printable]
DEBUG: Le jeudi 1 octobr

-- Message [4500] Subject [50] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Rebonjour Je me r=E9ponds =E0 moi m=EAme, =E7a pou
-- Message [4501] Subject [50] Author [2] Encoding ["UTF-8",quoted-printable]
DEBUG: Le vendredi 07 novembre 2014 =C3=A0 16:31 +0100, L
-- Message [4487] Subject [51] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Bonjour, je suis en train de monter un LXC sur une
-- Message [4488] Subject [51] Author [2] Encoding [windows-1252,quoted-printable]
DEBUG: ) OK, merci du retour. N'h=E9sites pas =E0 mettre 
-- Message [4489] Subject [51] Author [3] Encoding [windows-1252,quoted-printable]
DEBUG: Le 13/08/2014 17:25, Sylvestre Ledru a =E9crit : y
-- Message [4582] Subject [52] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour Je risque, b=C3=AAtement, de vouloir me la
-- Message [4583] Subject [52] Author [2] Encoding [iso-8859-1,quoted-printable]
DEBUG: Le Wed, Feb 03, 2016 at 12:28:15PM +0100, Laurent 
-- Message [4629] Subject [53] 

-- Message [4542] Subject [99] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Salut. Pour info, je confirme l'annonce, et remerc
-- Message [4548] Subject [99] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Salut. Pour info, l'enregistrement de la conf est 
-- Message [4505] Subject [100] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: This is a multi-part message in MIME format.
-- Message [4508] Subject [100] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: This is a multi-part message in MIME format.
-- Message [4580] Subject [101] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: 
-- Message [4588] Subject [102] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Une commune de proche-banlieue parisienne
-- Message [4589] Subject [102] Author [2] Encoding [UTF-8,quoted-printable]
DEBUG: Bonjour C'est possible de connaitre la commune? Co
-- Message [4590] Subject [102] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Etant une demande "r=

- Sorting items
-- There are 166 unique subjects in this forum
- Combining conversations
-- Message [80681] Subject [1] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Comme nous sommes en septembre, il est d�
-- Message [80690] Subject [2] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Terminé. Merci à Alban pour sa relectur
-- Message [80692] Subject [2] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Terminé. Merci à François, Baptiste et
-- Message [80646] Subject [3] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80644] Subject [4] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80752] Subject [5] Author [1] Encoding ["us-ascii",quoted-printable]
DEBUG: 
-- Message [80630] Subject [6] Author [1] Encoding ["utf-8",binary]
DEBUG: This is a multi-part message in MIME format.
-- Message [80682] Subject [6] Author [1] Encoding ["utf-8",binary]
DEBUG: This is a multi-part message in MI

-- Message [80714] Subject [64] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Dans la suite des paquets OpenStack, une 
-- Message [80772] Subject [65] Author [1] Encoding [utf-8,quoted-printable]
DEBUG: Bonjour, Dans la suite des paquets d'OpenStack, je
-- Message [80774] Subject [66] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, La troisième édition de l'année des «
-- Message [80708] Subject [67] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, L'annonce de la première version Alpha d
-- Message [80643] Subject [68] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Dédoublonné ligne 90 (option * 2), merc
-- Message [80647] Subject [69] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Espace protégée corrigée. Merci pour v
-- Message [80645] Subject [70] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80617] Subject [71] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Balise corrigée. Merci pour vos nouvelle
-- Messag

-- Message [80655] Subject [128] Author [1] Encoding [UTF-8;,base64]
DEBUG: This is a multi-part message in MIME format.
-- Message [80619] Subject [129] Author [1] Encoding [utf-8,8bit]
DEBUG: Bonjour, Merci Jean-Paul, préférence appliquée.
-- Message [80785] Subject [130] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80787] Subject [130] Author [2] Encoding [UTF-8,quoted-printable]
DEBUG: 
-- Message [80809] Subject [131] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80763] Subject [132] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80788] Subject [133] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME format.
-- Message [80806] Subject [133] Author [2] Encoding [UTF-8,quoted-printable]
DEBUG: 
-- Message [80808] Subject [134] Author [1] Encoding [utf-8,8bit]
DEBUG: This is a multi-part message in MIME form

- Sorting items
-- There are 75 unique subjects in this forum
- Combining conversations
-- Message [69] Subject [1] Author [1] Encoding ["utf-8",none]
DEBUG: 1 juillet 2005. Avant l'été, la FSF France publi
-- Message [77] Subject [2] Author [1] Encoding ["utf-8",none]
DEBUG: 8 décembre 2005. Communiqué de presse AFUL/APRIL
-- Message [74] Subject [3] Author [1] Encoding ["utf-8",none]
DEBUG: 24 novembre 2005. Le gouvernement français a déc
-- Message [68] Subject [4] Author [1] Encoding ["utf-8",none]
DEBUG: 28 juin 2005. Les associations ADULLACT, AFUL, APR
-- Message [82] Subject [5] Author [1] Encoding ["utf-8",none]
DEBUG: 9 mars 2006. Paris, le 9 mars 2006. Communiqué de
-- Message [84] Subject [6] Author [1] Encoding ["utf-8",none]
DEBUG: 20 mars 2006. Paris, le 20 mars 2006. Communiqué 
-- Message [64] Subject [7] Author [1] Encoding ["utf-8",none]
DEBUG: 11 mai 2005. A l'occasion d'une interview avec le 
-- Message [80] Subject [8] Author [1] Encoding ["utf-8",none]
DEBUG: 10 

-- Message [62] Subject [71] Author [1] Encoding ["utf-8",none]
DEBUG: 1er mars 2005. L'Association Francophone des Utili
-- Message [76] Subject [72] Author [1] Encoding ["utf-8",none]
DEBUG: 2 décembre 2005. La FSF France a signé la pétit
-- Message [58] Subject [73] Author [1] Encoding [iso-8859-1,quoted-printable]
DEBUG: 20 avril 2004. Manifestation/concert de la Place d
-- Message [85] Subject [74] Author [1] Encoding ["utf-8",none]
DEBUG: 27 mars 2006. Paris, le lundi 27 mars 2006, [EUCD.
-- Message [59] Subject [75] Author [1] Encoding [iso-8859-1,quoted-printable]
DEBUG: 9 juin 2004. A quelques jours de l'examen pr=E9vu 
Group [gmane.org.w3c.translators.french] has 321 articles (1, 322)
- Reading items
-- DEBUG: Truncating to (122,322)
- Sorting items
-- There are 159 unique subjects in this forum
- Combining conversations
-- Message [190] Subject [1] Author [1] Encoding [US-ASCII,7bit]
DEBUG: Hi, I'm having the hardest time translating (from 
-- Message [191] Subject [1] Autho

-- Message [177] Subject [63] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: 
-- Message [178] Subject [63] Author [2] Encoding [UTF-8,quoted-printable]
DEBUG: Karl Dubost 16/02/05 20:06 -0500: Karl, A priori o
-- Message [213] Subject [64] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Bonjour Un nouveau document =E0 propos des traduct
-- Message [214] Subject [64] Author [2] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Scribit Karl Dubost dies 11/11/2005 hora 10:38: J'
-- Message [215] Subject [64] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Bonjour Pierre, D=E9sol=E9 pour le d=E9lai dans la
-- Message [216] Subject [64] Author [2] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Scribit Karl Dubost dies 18/11/2005 hora 14:44: Si
-- Message [217] Subject [64] Author [3] Encoding [ISO-8859-1,quoted-printable]
DEBUG: bonjour =20 Comment soumet-on une traduction au W3
-- Message [218] Subject [64] Author [4] Encoding [ISO-8859-1,quoted-printable]
DEBUG: 
-

-- Message [272] Subject [109] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [273] Subject [109] Author [2] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Le 20 mai 2008 =E0 01:01, Jean-Jacques SOLARI a =E
-- Message [263] Subject [110] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [259] Subject [111] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [261] Subject [112] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [265] Subject [113] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [279] Subject [114] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [275] Subject [115] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [277] Subject [116] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [281] Subject [117] Author [1] Encoding [none,base64]
DEBUG: 
-- Message [180] Subject [118] Author [1] Encoding [ISO-8859-1,quoted-printable]
DEBUG: Salut =E0 tous, Voici l'annonce d'ach=E8vement d'u
-- Message [157] Subject [119] Author [1] Encoding [UTF-8,quoted-

In [13]:
hname="stats_author_"+group+".csv"
h=open(hname,'wb')
author_writer = csv.writer(h)
author_writer.writerow(['Conversation no.', 'No. of authors']) 