In [1]:
import sys
import csv
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from catboost import CatBoostClassifier
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os
import gc
import lightgbm as lgb
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import accuracy_score
from sklearn_crfsuite.metrics import sequence_accuracy_score

from scripts.feature_extractor import char_to_features, sent_to_features
from scripts.preprocessor import clean_arabic
from scripts.postprocessor import get_human_readable_segmentation, _humanify_sentence, _humanify_word


# Context-feature names ('minus5' .. 'plus5' around 'focus') that occur, in this
# order, in both feature sets.
_context_window = ['minus5', 'minus4', 'minus3', 'minus2', 'minus1', 'focus',
                   'plus1', 'plus2', 'plus3', 'plus4', 'plus5']

# Feature set 'fc1': character position, the context window, then letter-pair and
# prefix/suffix features.
feature_codes_1 = (
    ['chr_position']
    + _context_window
    + ['next2letters', 'prev2letters',
       'prev_word_suffix', 'following_word_prefix',
       'focus_word_prefix', 'focus_word_suffix']
)

# Feature set 'fc2': the context window, then features named after the previous
# and the following word.
feature_codes_2 = (
    _context_window
    + ['prev_word_minus1', 'prev_word_minus2', 'prev_word_minus3',
       'following_word_plus0', 'following_word_plus1', 'following_word_plus2']
)

# Short code -> ordered list of feature names; indexed by the `feature_code` / `fc`
# arguments of the functions in this notebook.
feature_codes = dict(fc1=feature_codes_1, fc2=feature_codes_2)

In [2]:
def create_featureset(in_tsv_path, raw_col, seg_col, feature_code, out_file_path):
    """
    Write one row of features (plus label) per character of the sentences in a tsv file.

    The input file is tab-separated with a header row. It must contain a 'file'
    column and the column named by raw_col. The output is a tab-separated file with
    the header: file, sentence_no, word_no, word, char, <feature names>, char_label.

    in_tsv_path: path of the input tsv file
    raw_col: name of the column containing raw sentences
    seg_col: name of the column containing segmented sentences. Pass a falsy value
        (None or '') for unlabelled input; char_label is then written as an empty string.
    feature_code: 'fc1' or 'fc2' (a key of feature_codes)
    out_file_path: path of the output file. It is used exactly as given and is
        overwritten if it already exists.
    """
    feature_names = feature_codes[feature_code]

    # newline='' is what the csv module requires of files handed to reader/writer
    # objects; without it the writer can emit an extra '\r' per row on Windows.
    with open(in_tsv_path, newline='') as infile, open(out_file_path, 'w', newline='') as outfile:
        freader = csv.DictReader(infile, delimiter='\t')
        fwriter = csv.writer(outfile, delimiter='\t')
        fwriter.writerow(["file", "sentence_no", "word_no", "word", "char"] + feature_names + ["char_label"])
        for line_no, line in enumerate(freader):
            raw_sent = line[raw_col]
            sent_feats = sent_to_features(raw_sent, feature_names)
            words = raw_sent.split()
            if seg_col:
                # NOTE(review): sent_to_labels is neither imported nor defined in the part
                # of the notebook visible here; it must be in scope when seg_col is given.
                sent_labels = sent_to_labels(line[seg_col])
            else:
                # No gold segmentation available: one empty label per character.
                sent_labels = [[''] * len(word) for word in words]

            # zip() stops at the shortest input, so words without features or labels are skipped.
            for word_no, (word, word_feats, word_labels) in enumerate(zip(words, sent_feats, sent_labels)):
                for char, char_feats, char_label in zip(word, word_feats, word_labels):
                    fwriter.writerow([line['file'], line_no, word_no, word, char] + char_feats + [char_label])

In [3]:
def create_prediction_features(sentences, feature_code):
    """
    Build the per-character feature table used for prediction.

    sentences: iterable of whitespace-separated sentence strings
    feature_code: key of feature_codes selecting the feature names to extract

    Returns a DataFrame with one row per character and the columns
    'sentence_no', 'word_no' followed by the selected feature names.
    sentence_no is the position of the sentence in `sentences`, word_no the
    position of the word inside its sentence.
    """
    feature_names = feature_codes[feature_code]

    def iter_rows():
        # Yields [sentence_no, word_no, *features] for every character.
        for sent_idx, sent in enumerate(sentences):
            per_word_feats = sent_to_features(sent, feature_names)
            for word_idx, (token, token_feats) in enumerate(zip(sent.split(), per_word_feats)):
                # The character itself is not stored, only its features.
                for _, char_feats in zip(token, token_feats):
                    yield [sent_idx, word_idx] + char_feats

    return pd.DataFrame(list(iter_rows()), columns=['sentence_no', 'word_no'] + feature_names)

def segment(sentences, model, fc):
    """
    Segment `sentences` using `model` and the feature set named by `fc`.

    The per-character feature frame is built with create_prediction_features,
    `model.predict` is called on the feature columns and its output cast to int,
    and the predictions are collected into one list per (sentence_no, word_no).
    Those lists and the sentences are handed to get_human_readable_segmentation,
    whose result is returned.
    """
    frame = create_prediction_features(sentences, fc)
    model_inputs = frame[feature_codes[fc]]
    frame['predictions'] = model.predict(model_inputs).astype(int)
    per_word = frame.groupby(['sentence_no', 'word_no'])['predictions']
    return get_human_readable_segmentation(sentences, per_word.apply(list))

In [6]:
# Result tables that receive the segmentation columns.
data1 = pd.read_csv("data/combined/results/ManarCorrected.result", sep='\t')
data2 = pd.read_csv("data/combined/results/Classical.result", sep='\t')
datas = [data1, data2]
model_folder = 'data/segmenter/models'

# Alternative model files in model_folder (not used below); assign this dict to
# `models` to segment with them instead.
alternative_models = {'ManarPlusClassical': 'catboost_fc1_1.model',
                      'ManarPlusClassicalSubstandard': 'catboost_fc1_1_sso.model',
                      'ManarOnly': 'catboost_fc1_ManarOnly.model',
                      'ManarOnlySubstandard': 'catboost_fc1_ManarOnly_sso.model'}

# Model name (used in the output column names) -> model file inside model_folder.
models = {'ManarPlusClassical': 'catboost_fc1_Train4.model'}
raw_columns = ['original_raw', 'sso_raw', 'sso_raw_standardized']

# Load every model once up front instead of re-reading it from disk for each column.
loaded_models = {}
for model_name, model_file in models.items():
    model = CatBoostClassifier()
    model.load_model(os.path.join(model_folder, model_file))
    loaded_models[model_name] = model

# Only the second table is processed here. `data` is the same object as data2, so
# the new columns are added to data2 as well.
data = data2
for col in raw_columns:
    for model_name, model in loaded_models.items():
        new_col = col + '_' + model_name + '_segmented'
        data[new_col] = segment(data[col], model, 'fc1')

# index=False is not passed, so the row index is written as an unnamed first column.
data.to_csv('data/combined/results/test2result4.tsv', sep='\t')

In [15]:
# Rebind `data` to the test1 result table and `model` to the catboost_fc1_1 model;
# the cells below operate on these two names.
data = pd.read_csv("data/combined/results/test1result.tsv", sep='\t')
model = CatBoostClassifier()
# Last expression of the cell: the value returned by load_model is shown as the cell output.
model.load_model("data/segmenter/models/catboost_fc1_1.model")

<catboost.core.CatBoostClassifier at 0x7f1064bdc908>

In [16]:
# Add one '<column>_segmented' column to `data` for each raw-text column, using
# feature set 'fc1' and whatever `model` is currently bound in the notebook.
raw_columns = [
    'original_raw',
    'sso_raw',
    'sso_raw_standardized',
]
for col in raw_columns:
    new_col = '{}_segmented'.format(col)
    data[new_col] = segment(data[col], model, 'fc1')

In [17]:
# Save `data` as a tab-separated file. index=False is not passed, so pandas also
# writes the row index as an unnamed first column.
data.to_csv("data/combined/results/test1result_updated.tsv", sep='\t')

In [None]:
# NOTE(review): both lines write the same `data` frame, so running this cell as-is
# leaves Classical.result and Manar.result with identical contents. Classical.result
# is also read as an input by an earlier cell, so it would be overwritten.
# Presumably each line is meant to be run on its own with the matching frame bound
# to `data` -- confirm before executing.
data.to_csv('data/combined/results/Classical.result', sep='\t', index=False)
data.to_csv('data/combined/results/Manar.result', sep='\t', index=False)