In [20]:
%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np
import pickle
import pandas as pd
from scripts.feature_extractor import sent_to_features, sent_to_labels
from scripts.preprocessor import clean_arabic
import pickle
import csv
import os

In [3]:
feature_codes_1 = ['chr_position', 'minus5', 'minus4', 'minus3', 'minus2', 'minus1', 'focus',
                 'plus1', 'plus2', 'plus3', 'plus4', 'plus5', 'next2letters', 
                 'prev2letters', 'prev_word_suffix', 'following_word_prefix',
                 'focus_word_prefix', 'focus_word_suffix']

feature_codes_2 = ['minus5', 'minus4', 'minus3', 'minus2', 'minus1', 'focus',
                 'plus1', 'plus2', 'plus3', 'plus4', 'plus5',
                   'prev_word_minus1', 'prev_word_minus2', 'prev_word_minus3',
                  'following_word_plus0', 'following_word_plus1', 'following_word_plus2']

feature_codes = {'fc1': feature_codes_1, 'fc2': feature_codes_2}

In [8]:
def create_featureset(dataframe, feature_codes, out_file_path):
    """
    This functions takes in a tsv file with 4 columns: file, sentence_no, raw and seg
    and creates a featureset out of it. 
    """
    if feature_codes == 'fc1':
        feature_names = feature_codes_1
    elif feature_codes == 'fc2':
        feature_names = feature_codes_2
    else:
        raise Exception("Feature code not supported")
        
    with open(out_file_path, 'w') as out_file:
        fwriter = csv.writer(out_file, delimiter='\t')
        fwriter.writerow(["file", "sentence_no", "word_no", "word", "char"] + feature_names + ["char_label"])
        for i in range(len(dataframe)):
            raw_sent = dataframe['raw'][i]
            seg_sent = dataframe['seg'][i]
            sent_feats, sent_labels = sent_to_features(raw_sent, feature_names), sent_to_labels(seg_sent)
            for word_no, (word, word_feats, word_labels) in enumerate(zip(raw_sent.split(), sent_feats, sent_labels)):
                for char, char_feats, char_label in zip(word, word_feats, word_labels):
                    fwriter.writerow([dataframe['file'][i], i, word_no, word, char] + char_feats + [char_label])

In [10]:
def create_featureset(in_tsv_path, raw_col, seg_col, feature_code, out_file_path):
    """
    This function takes as input a tsv file which contains the raw and segmented form of sentence.
    raw_col is the name of the column containing raw sentences
    seg_col is the name of the column containing seg sentences
    feature_code = 'fc1' or 'fc2'
    The path of the output folder. It will append '_{feature_code}' to the input file name to produce
    the output file name
    """
    feature_names = feature_codes[feature_code]
    
    with open(in_tsv_path) as infile, open(out_file_path, 'w') as outfile:
        freader = csv.DictReader(infile, delimiter='\t')
        fwriter = csv.writer(outfile, delimiter='\t')
        fwriter.writerow(["file", "sentence_no", "word_no", "word", "char"] + feature_names + ["char_label"])
        for line_no, line in enumerate(freader):
            raw_sent = line[raw_col]
            sent_feats = sent_to_features(raw_sent, feature_names)
            if seg_col:
                seg_sent = line[seg_col]
                sent_labels = sent_to_labels(seg_sent)
            
            sent_feats, sent_labels = sent_to_features(raw_sent, feature_names), sent_to_labels(seg_sent)
            for word_no, (word, word_feats, word_labels) in enumerate(zip(raw_sent.split(), sent_feats, sent_labels)):
                for char, char_feats, char_label in zip(word, word_feats, word_labels):
                    fwriter.writerow([line['file'], line_no, word_no, word, char] + char_feats + [char_label])

In [39]:
"""
A new test set was provided by Emad which will help us compare our segmenter with Mada-Mira and Farasa.
Emad has results of running the above two segmenters on this test set.
Location: data/segmenter/raw/test2.txt (File 1)

More segmentation data was provided by Emad.
Location: data/segmenter/raw/all_my_segmentation_data.corrected (File 2)

Note: There is overlap between File 1 and File 2.

From the original Al-Mannar corpus, we have already carved out a train, dev and test.
The processed versions of these named train1.tsv, dev1.tsv and test1.tsv are stored in data/segmenter/raw

In this codeblock, we will find the overlap between the two files and add the non-overlapping sentences from 
File 2 to the original train and make a new train2.tsv. 
The original dev1.tsv will remain as is and we will have a new test set called test2.tsv

Eventually, we shall have the following:
train1.tsv, train2.tsv
dev1.tsv
test1.tsv, test2.tsv
"""
test2_set = set()
with open('data/segmenter/raw/test2.txt') as test2file:
    for line in test2file:
        test2_set.add(line.strip())
        
allseg_set = set()
with open('data/segmenter/raw/all_my_segmentation_data.corrected') as allsegfile:
    for line in allsegfile:
        allseg_set.add(line.strip())
        
with open('data/segmenter/raw/test2.tsv', 'w') as test2tsv:
    fwriter = csv.writer(test2tsv, delimiter='\t')
    fwriter.writerow(['file', 'sentence', 'raw', 'seg'])
    for sentence_no, seg in enumerate(test2_set):
        raw = ' '.join([''.join(word.split('+')) for word in seg.split(' ')])
        fwriter.writerow(['test2', sentence_no, raw, seg])

with open('data/segmenter/raw/train2.tsv', 'w') as train2tsv, open('data/segmenter/raw/train1.tsv') as train1tsv:
    fwriter = csv.writer(train2tsv, delimiter='\t')
    fwriter.writerow(['file', 'sentence', 'raw', 'seg'])
    for sentence_no, seg in enumerate((allseg_set - test2_set)):
        if seg:
            raw = ' '.join([''.join(word.split('+')) for word in seg.split(' ')])
            fwriter.writerow(['all_my_seg_data', sentence_no, raw, seg])
    
    freader = csv.reader(train1tsv, delimiter='\t')
    next(freader, None)
    for line in freader:
        fwriter.writerow(line)

In [43]:
"""
There were errors found in test1 or Manar Test. So Emad corrected them
and sent me a file called manar_corrected.test. Let us process it 
and save it in test3.test
"""
with open('data/segmenter/raw/test3.tsv', 'w') as test3tsv, \
        open('data/segmenter/raw/manar_corrected.test', 'r') as manar_test:
    
    fwriter = csv.writer(test3tsv, delimiter='\t')
    fwriter.writerow(['file', 'sentence', 'raw', 'seg'])
    for sentence_no, seg in enumerate(manar_test):
        seg = seg.strip()
        raw = ' '.join([''.join(word.split('+')) for word in seg.split(' ')])
        fwriter.writerow(['test2', sentence_no, raw, seg])

In [6]:
"""
The train2 created above apparently had sentences from ATB which were present in 
all_my_segmentation_data. So, we area creating a train3 by adding more stuff from 
a new folder called classical_train. ATB data has been removed from this.
"""
i = 0
files = os.listdir("data/segmenter/raw/classical_train/")
with open('data/segmenter/raw/train4.tsv', 'w') as train3tsv, open('data/segmenter/raw/train1.tsv') as train1tsv:
    fwriter = csv.writer(train3tsv, delimiter='\t')
    fwriter.writerow(['file', 'sentence', 'raw', 'seg'])
    for rfile_name in files:
        with open(os.path.join("data/segmenter/raw/classical_train/", rfile_name)) as rfile:
            for sentence_no, seg in enumerate(rfile):
                seg = seg.strip()
                if seg:
                    raw = ' '.join([''.join(word.split('+')) for word in seg.split(' ')])
                    fwriter.writerow([rfile_name, sentence_no, raw, seg])
                    i += 1
            print(i)
            input()
    freader = csv.reader(train1tsv, delimiter='\t')
    next(freader, None)
    for line in freader:
        fwriter.writerow(line)
        i += 1
    print(i)

48


 a


225


 a


285


 a


2443


 a


2471


 a


2509


 a


9172


In [9]:
"""
The following piece of code will call the create_featureset() function for dev/train/test sets.
You just need to change the `set_no` and the `feature_code`.
"""
# Set the following variables.
feature_code = 'fc1'
in_folder = 'data/segmenter/raw'
out_folder = 'data/segmenter/processed/'

# file_sets = ['dev', 'test', 'train']
# file_names = [i + str(set_no) + '.tsv' for i in file_sets]
# file_names = ['train1_sso.tsv', 'dev1_sso.tsv', 'test1_sso.tsv']
# file_names = [file for file in os.listdir(in_folder) if file.endswith('.tsv')]
file_names = ['train4.tsv']

for f in file_names:
    print('Working on {}'.format(f))
    frame = pd.read_csv(os.path.join(in_folder, f), delimiter='\t')
    outfilename = f.split('.')[0] + '_' + feature_code + '.tsv'
    create_featureset(frame, feature_code, os.path.join(out_folder, outfilename))

Working on train4.tsv


In [12]:
"""
The following code will convert the segmentation data to substandard orthography
The will enable us to study the effect of segmention on standard and substandard data

The files will be read from in_folder and writted into out_folder
"""

in_folder = 'data/segmenter/raw'
out_folder = 'data/combined/raw'
file_names = [file for file in os.listdir(in_folder) if file.endswith('.tsv')]
file_names = ['train4.tsv']
substandard_dict = {
    'أ': 'ا',
    'إ': 'ا',
    'آ': 'ا',
    'ة': 'ه',
}

def substandardize(somestring):
    out = [substandard_dict.get(letter, letter) for letter in somestring]
    return ''.join(out)

def replacey(word):
    if word.endswith('ي'):
        new_word = word[:-1] + 'ى'
    else:
        new_word = word
    return new_word


for file in file_names:
    outfilename = file.split('.')[0] + '_sso.tsv'
    with open(os.path.join(in_folder, file)) as infile, open(os.path.join(out_folder, outfilename), 'w') as outfile:
        freader = csv.DictReader(infile, delimiter='\t')
        fwriter = csv.writer(outfile, delimiter='\t')
        fwriter.writerow(['file', 'sentence', 'original_raw', 'original_seg', 'sso_raw', 'sso_seg'])
        for line in freader:
            raw_line = line['raw']
            raw_sso_line = " ".join([replacey(w) for w in substandardize(raw_line).split()])
            seg_line = line['seg']
            seg_sso_line = " ".join([replacey(w) for w in substandardize(seg_line).split()])
            fwriter.writerow([line['file'], line['sentence'], raw_line, seg_line, raw_sso_line, seg_sso_line])

In [13]:
"""
The following piece of code will call the create_featureset() function for SUBSTANDARD dev/train/test sets.
You just need to change the `set_no` and the `feature_code`.
"""
# Set the following variables.
feature_code = 'fc1'
in_folder = 'data/combined/raw'
out_folder = 'data/combined/processed/'
raw_col = 'sso_raw'
seg_col = 'sso_seg'

# file_sets = ['dev', 'test', 'train']
# file_names = [i + str(set_no) + '.tsv' for i in file_sets]
# file_names = ['train1_sso.tsv', 'dev1_sso.tsv', 'test1_sso.tsv']
# file_names = ['test3.tsv', 'train3.tsv']
# file_names = [file for file in os.listdir(in_folder) if file.endswith('.tsv')]
file_names = ['train4_sso.tsv']

for f in file_names:
    print('Working on {}'.format(f))
    out_file_name = f.split('.')[0] + '_' + feature_code + '.tsv'
    out_file_path = os.path.join(out_folder, out_file_name)
    create_featureset(os.path.join(in_folder, f), raw_col, seg_col, feature_code, out_file_path)

Working on train4_sso.tsv
