In [1]:
# Import libraries
from nntplib import NNTP
import numpy as np
import os

######################################

def find_unique_subjects_and_authors(items_original):
    
    # Empty list
    items_reformatted = []

    # Create new list of tuples with only relevent details
    for message_number, subject, author, date, message_id, references, size, lines in items_original:
        # Remove "Re: " in subject line
        if subject[0:4] == "Re: ":
            subject = subject[4:]
        items_reformatted.append((message_number, subject, author))

    # Convert to numpy array
    dtype = [('message_number', int), ('subject', 'S40'), ('author', 'S40')]
    items_array = np.array(items_reformatted, dtype = dtype)

    # Sort by subject
    items_array_sort = np.sort(items_array, axis=0, order="subject")
        
    # Iterate through subjects to find unique subjects and authors
    subject_number = 0
    subject_current = ''
    authors = []
    author_numbers = []
    items_array_new = []
    for item in items_array_sort:
        if item[1] != subject_current:
            subject_current = item[1]
            subject_number += 1
            authors = []
            author_numbers.append(0)
        if item[2] not in authors:
            authors.append(item[2])
            author_numbers[-1] += 1
        items_array_new.append((int(item[0]), subject_number, authors.index(item[2])+1))
        
    # Return array of unique subjects and authors
    return items_array_new, subject_number, author_numbers

######################################

def combine_conversations(items_to_combine, number_of_subjects):
    
    # Create blank array
    conversations = []
    for i in range(number_of_subjects):
        conversations.append('<s>')
    
    # Parse the message
    for message_number, subject_number, author_number in items_to_combine:
        
        # Read header and body
        resp, number, id, text = s.article(str(message_number))
        
        # Determine encoding
        encoding = 'none'
        for line in text:
            if line[0:18] == 'Content-Type: text':
                index = line.find('charset')
                if index != -1:
                    encoding = line[index+8:]
                    end = min(encoding.find(';'), encoding.find(' '))
                    if end != -1:
                        encoding = encoding[0:end]
                    # print "DEBUG: line + " === " + encoding
        encoding2 = 'none'
        for line in text:
            if line[0:27] == 'Content-Transfer-Encoding: ':
                encoding2 = line[27:]
        
        # print "DEBUG: line + " === " + encoding
        print "-- Message [" + str(message_number) + "] Subject [" + str(subject_number) + "] Author [" + str(author_number) + "]" + " Encoding [" + encoding + "," + encoding2 + "]"
        
        # Setup variables to parse body
        header = True
        finish = False
        output = ""

        # Parse body of post
        # ***************** THIS PART NEEDS EXCEPTION CATCHING AS IF ENCODING DOES NOT MATCH MESSAGE IT CRASHES *************
        for line_raw in text:
            # Decode text into UTF-8
            if encoding == 'none':
                line = line_raw.encode('utf-8')
            else:
                line = line_raw.decode(encoding).encode('utf-8')
            # Parse pre-signature limiter
            if line[0:2] == "--":
                finish = True    
            # Add line if appropriate
            if header == False and finish == False and line[0:1] != '>' and 'wrote:' not in line and 'a écrit :' not in line and len(line) > 0:
                # print line
                if len(output) == 0:
                    output = line
                else:
                    if output[-1:] == ' ':
                        output = output + line
                    else:
                        output = output + " " + line                        
            # Find end of header
            if line == '':
                header = False
                
        print "DEBUG: " + output[0:50]

        #print subject_number, author_number, output[0:39]
        conversations[subject_number - 1] += '<utt uid="' + str(author_number) + '">' + output + '</utt>'
        #print conversations[subject_number - 1]
        #print "DEBUG: " + output[0:40]

    # Insert end-limiter for conversations
    for i in range(number_of_subjects):
        conversations[i] += '</s>'

    return conversations

######################################

# Open news server
#s = NNTP('freenews.netfront.net', readermode=True)
s = NNTP('blaine.gmane.org', readermode=True)
#s = NNTP('news2.informatik.uni-stuttgart.de', readermode=True) - does not return groups
print "Opened connection to news server"

# Read pre-existing group list if available
if os.path.isfile('groups.txt'):
    french_groups = []
    with open('groups.txt') as f:
        for line in f:
            french_groups.append(line[:-1])
    print "- read " + str(len(french_groups)) + " French groups from file"
# Otherwise create from server
else: 
    # Read group list
    groups = s.list()
    print "- there are " + str(len(groups[1])) + " groups in total"

    # Create empty list for French groups
    french_groups = []

    # Find French groups
    for group in groups[1]:
        currgroup = group[0]
        #if currgroup[0:2] == 'fr':
        if 'french' in currgroup:
            french_groups.append(group[0])
    print "- there are " + str(len(french_groups)) + " groups in French"

    # Save French groups
    with open('groups.txt', 'w') as f:
        for group in french_groups:
            f.write(group + "\n")
    print "- written to file"

# Open file to write
f = open('samplegroup_fra.xml', 'w')
g = open('stats_groups.csv', 'w')
h = open('stats_message.csv', 'w')

# Iterate through French groups (only subset at present)
for group in french_groups[0:5]:
    # ********** THIS NEEDS TO BE CHANGED TO LOOK AT ALL GROUPS FOR FINAL VERSION ************
    
    # Read number of messages in group
    resp, count, first, last, name = s.group(group)
    print "Group [" + group + "] has " + count + " articles (" + first + ", " + last + ")"
    
    # Read items info from group
    print "- Reading items"
    # DEBUG - ******** THIS NEEDS TO BE REMOVED, IT JUST LOOKS AT LAST 200 MESSAGES TO SAVE TIME FOR NOW *********
    if int(last)-int(first) > 200:
        first = str(int(last)-200)
        print "-- DEBUG: Truncating to (" + first + "," + last + ")"
    # DEBUG
    resp, items = s.xover(first, last)
        
    # Find unique subjects and authors
    print "- Sorting items"
    items_unique, subject_number, author_numbers = find_unique_subjects_and_authors(items)
    print "-- There are " + str(subject_number) + " unique subjects in this forum"
    
    # ******* NEED TO WRITE THE GROUP, NUMBER OF MESSAGES etc. TO A FILE g FOR STATISTICS *******
    
    # Combine conversations
    print "- Combining conversations"
    conversations = combine_conversations(items_unique, subject_number)
    
    # Write conversations with 2 or more authors
    index = 0
    for dialog in conversations:
        if author_numbers[index] >= 2:
            f.write('<dialog>\n')
            f.write(dialog + '\n')
            f.write('</dialog>\n')
            print "-- Writing conversation on subject " + str(index + 1)
            print "DEBUG: " + dialog
        index += 1
        
    # ****** NEED TO WRITE TO FILE h FOR EACH CONVERSATION THAT IS WRITTEN THE NUMBER OF AUTHORS OR SOME OTHER STATS *******
    
# Close file
f.close()
g.close()
h.close()

Opened connection to news server
- read 79 French groups from file
Group [gmane.linux.debian.user.french] has 204072 articles (1, 205144)
- Reading items
-- DEBUG: Truncating to (204944,205144)
- Sorting items
-- There are 72 unique subjects in this forum
- Combining conversations
-- Message [205141] Subject [1] Author [1] Encoding [none,8BIT]


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe9 in position 6: ordinal not in range(128)

In [13]:
s = NNTP('blaine.gmane.org', readermode=True)
resp, count, first, last, name = s.group('gmane.linux.debian.user.french')
resp, number, id, text = s.article(str(205010))
resp, number, id, text2 = s.article(str(204965))

In [54]:
resp, number, id, text2 = s.article(str(35))
print text2

EOFError: 

In [14]:
for line in text:
    print line
    
print "---"

for line in text2:
    print line

Path: news.gmane.org!.POSTED!not-for-mail
From: roger.tarani@free.fr
Newsgroups: gmane.linux.debian.user.french
Subject: =?utf-8?Q?Re:_Cryptsetup_-_impossible_d'acc?=
 =?utf-8?Q?=C3=A9der_=C3=A0_une_machine_Jessie_chiffr=C3=A9e?=
Date: Tue, 12 Sep 2017 15:40:39 +0200 (CEST)
Lines: 27
Approved: news@gmane.org
Message-ID: <115376503.250275791.1505223639629.JavaMail.root@zimbra1-e1>
References: <325113268.250248781.1505223081974.JavaMail.root@zimbra1-e1>
NNTP-Posting-Host: blaine.gmane.org
Mime-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
X-Trace: blaine.gmane.org 1505223668 2840 195.159.176.226 (12 Sep 2017 13:41:08 GMT)
X-Complaints-To: usenet@blaine.gmane.org
NNTP-Posting-Date: Tue, 12 Sep 2017 13:41:08 +0000 (UTC)
To: Liste Debian <debian-user-french@lists.debian.org>
Original-X-From: bounce-debian-user-french=debian-user-french=m.gmane.org@lists.debian.org Tue Sep 12 15:41:02 2017
Return-path: <bounce-debian-user-french=debian-user-

In [38]:
import re
temp = "D'o=C3=B9 la remarque dans mon email pr=C3=A9c=C3=A9dent :=20"
print temp.decode('utf-8')
flag = True
while flag:
    match = re.search('(={1}[0-9A-F][0-9A-F]={1}[0-9A-F][0-9A-F])', temp)
    print type(match)
    if type(match) == 'NoneType':
        flag = False
    else:
        temp = temp[0:match.start()] + "?" + temp[match.start()+1:match.start()+3] + "?" + temp[match.start()+4:match.start()+6] + temp[match.start()+6:]
        print temp
print temp

D'o=C3=B9 la remarque dans mon email pr=C3=A9c=C3=A9dent :=20
<type '_sre.SRE_Match'>
D'o?C3?B9 la remarque dans mon email pr=C3=A9c=C3=A9dent :=20
<type '_sre.SRE_Match'>
D'o?C3?B9 la remarque dans mon email pr?C3?A9c=C3=A9dent :=20
<type '_sre.SRE_Match'>
D'o?C3?B9 la remarque dans mon email pr?C3?A9c?C3?A9dent :=20
<type 'NoneType'>


AttributeError: 'NoneType' object has no attribute 'start'