In [7]:
load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%autoreload

In [94]:
import sys
import segmenter
import csv
from scripts.preprocessor import clean_arabic
import pandas as pd
from scripts.feature_extractor import char_to_features
from sklearn.model_selection import StratifiedShuffleSplit
from catboost import CatBoostClassifier
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os
import gc
import lightgbm as lgb
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook as tqdm

In [20]:
substandard_dict = {
    'أ': 'ا',
    'إ': 'ا',
    'آ': 'ا',
    'ة': 'ه',
}

def substandardize(somestring):
    out = [substandard_dict.get(letter, letter) for letter in somestring]
    return ''.join(out)

def replacey(word):
    if word.endswith('ي'):
        new_word = word[:-1] + 'ى'
    else:
        new_word = word
    return new_word

In [5]:
feature_codes_1 = ['chr_position', 'minus5', 'minus4', 'minus3', 'minus2', 'minus1', 'focus',
                 'plus1', 'plus2', 'plus3', 'plus4', 'plus5', 'next2letters', 
                 'prev2letters', 'prev_word_suffix', 'following_word_prefix',
                 'focus_word_prefix', 'focus_word_suffix']

feature_codes_2 = ['minus5', 'minus4', 'minus3', 'minus2', 'minus1', 'focus',
                 'plus1', 'plus2', 'plus3', 'plus4', 'plus5',
                   'prev_word_minus1', 'prev_word_minus2', 'prev_word_minus3',
                  'following_word_plus0', 'following_word_plus1', 'following_word_plus2']

feature_codes = {'fc1': feature_codes_1, 'fc2': feature_codes_2}

In [110]:
# The following code converts a raw text file into a tsv
# The tsv has standard and substandard sentences side by side

#filename = "data/sso/islamweb_articles.txt"
filename = "data/sso/raw/siyar.txt"
outfilename = "data/sso/processed/" + os.path.split(filename)[1].split('.')[0] + '.tsv'

with open(outfilename, 'w') as outfile:
    fwriter = csv.writer(outfile, delimiter='\t')
    fwriter.writerow(["standard", "substandard"])
    infile = open(filename)
    for line in infile:
        line = clean_arabic(line)
        line = " ".join(line.strip().split())
        sso_line = " ".join([replacey(w) for w in substandardize(line).split()])
        if len(line) != len(sso_line):
            print(len(line), len(sso_line))
            raise Exception("Lengths are not equal")
        fwriter.writerow([line, sso_line])

In [None]:
"""
Convert the raw sso.tsv to a feature frame.
"""
raw_folder = "data/standardizer/raw"
processed_folder = "data/standardizer/processed"
fc = 'fc1'
files = ['albidya_walnihaya.tsv', 'islamweb_articles.tsv', 'islamweb_fatwa_answers.tsv', 'siyar.tsv']
symbols = set(('ا', 'ه'))


for file in files:
    print(file)
    infilepath = os.path.join(raw_folder, file)
    outfilepath = os.path.join(processed_folder, file.split('.')[0] + '_' + fc + '.tsv')
    inframe = pd.read_csv(infilepath, sep='\t')

    problem_lines = []

    with open(outfilepath, 'w') as outfile:
        fwriter = csv.writer(outfile, delimiter='\t')
        fwriter.writerow(['sentence_no', 'word_no', 'char_no'] + feature_codes[fc] + ['target'])
        for i, sent in enumerate(inframe['substandard']):
            try:
                sent = sent.split()
                for word_pos, word in enumerate(sent):
                    for char_pos, char in enumerate(word):
                        if char in symbols or (char_pos == len(word)-1 and char == 'ى'):
                            features = char_to_features(char_pos, word_pos, sent, feature_codes[fc])
                            target = [inframe['standard'][i].split()[word_pos][char_pos]]
                            fwriter.writerow([i, word_pos, char_pos] + features + target)
            except AttributeError:
                problem_lines.append(i)

    print(problem_lines)

In [4]:
# The following code is completely hardcoded and only merges the four files
# using FC1. Please be careful before using it for other featuresets.
# OUTPUT: data1_fc1.tsc inside data/standardizer/sets
# This is all 4 files put together
sso_fc1 = pd.read_csv("data/standardizer/processed/islamweb_fatwa_answers_fc1.tsv", sep='\t')
sso_fc2 = pd.read_csv("data/standardizer/processed/islamweb_articles_fc1.tsv", sep='\t')
sso_fc3 = pd.read_csv("data/standardizer/processed/siyar_fc1.tsv", sep='\t')
sso_fc4 = pd.read_csv("data/standardizer/processed/albidya_walnihaya_fc1.tsv", sep='\t')
data = pd.concat([sso_fc1, sso_fc2, sso_fc3, sso_fc4])
data.to_csv('data/standardizer/sets/data1_fc1.tsv', sep='\t')

In [3]:
# The following code random undersamples the majority class i.e. 'ا'
# down to 7 million samples and saves the new dataframe as 'data2'
data = pd.read_csv('data/standardizer/sets/data1_fc1.tsv', sep='\t')
alif_df = data[data['target']=='ا'].sample(n=7000000, random_state=42)
non_alif_df = data[data['target']!='ا']
data2 = pd.concat([alif_df, non_alif_df])
data2.to_csv('data/standardizer/sets/data2_fc1.tsv', sep='\t')
data = data2
gc.collect()

20

In [4]:
data['target'].value_counts()

ا    7000000
ه    6877584
أ    3847878
ة    2653253
ي    2310738
ى    1975402
إ    1612944
آ     330392
Name: target, dtype: int64

In [5]:
pd.crosstab(data['focus'], data['target'], rownames=['input'], colnames=['output'])

output,آ,أ,إ,ا,ة,ه,ى,ي
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ا,330392,3847878,1612944,7000000,0,0,0,0
ه,0,0,0,0,2653253,6877584,0,0
ى,0,0,0,0,0,0,1975402,2310738


In [4]:
data['target'].value_counts(normalize=False)

ا    19846338
ه     6877584
أ     3847878
ة     2653253
ي     2310738
ى     1975402
إ     1612944
آ      330392
Name: target, dtype: int64

In [7]:
set_ratios = [(0.3, 0.5), (0.1, 0.5)]

data = pd.read_csv('data/standardizer/sets/data2_fc1.tsv', sep='\t')
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, temp_index in sss.split(data, data['target']):
    train = data.iloc[train_index]
    temp = data.iloc[temp_index]
    
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for dev_index, test_index in sss.split(temp, temp['target']):
    dev = temp.iloc[dev_index]
    test = temp.iloc[test_index]
    
train.to_csv('data/standardizer/sets/train1_fc1.tsv', sep='\t', index=True, header=True)
dev.to_csv('data/standardizer/sets/dev1_fc1.tsv', sep='\t', index=True, header=True)
test.to_csv('data/standardizer/sets/test1_fc1.tsv', sep='\t', index=True, header=True)

In [4]:
train = pd.read_csv('data/standardizer/sets/train1_fc1.tsv', sep='\t')
dev = pd.read_csv('data/standardizer/sets/dev1_fc1.tsv', sep='\t')
test = pd.read_csv('data/standardizer/sets/test1_fc1.tsv', sep='\t')
fc = 'fc1'

Xtrain, ytrain = train[feature_codes[fc]], train['target']
Xdev, ydev = dev[feature_codes[fc]], dev['target']
Xtest, ytest = test[feature_codes[fc]], test['target']
categorical_features_indices = np.where(Xtrain.dtypes != np.float)[0]

le = LabelEncoder()
le.fit(ytrain)
ytrain = le.transform(ytrain)
ydev = le.transform(ydev)
ytest = le.transform(ytest)
np.save('models/sso_encoder_classes.npy', le.classes_)

frames = [Xtrain, Xdev, Xtest]
for frame in frames:
    for c in frame.columns[1:]:
        frame[c] = frame[c].astype('category')
        
lgb_train = lgb.Dataset(Xtrain, ytrain)
lgb_eval = lgb.Dataset(Xdev, ydev, reference=lgb_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
print(le.classes_)
print(ytrain.shape, Xtrain.shape, Xdev.shape , ydev.shape, ytest.shape)

['آ' 'أ' 'إ' 'ا' 'ة' 'ه' 'ى' 'ي']
(18625733,) (18625733, 18) (3991229, 18) (3991229,) (3991229,)


In [None]:
params = {
    'objective': 'multiclass',
    'num_class': 8,
    'metric': 'multi_logloss',
    'train_metric': True,
    'boosting': 'gbdt',
    'num_leaves': 31,
    'num_threads': 7,
}

# model = lgb.train(params, 
#                 lgb_train,
#                 num_boost_round=1500,
#                 valid_sets=[lgb_train, lgb_eval],
#                 early_stopping_rounds=30)

model = lgb.cv(params,
               lgb_train,
               num_boost_round=1500,
               early_stopping_rounds=40
               #metrics=['multi_logloss', 'multi_error']
)


In [5]:
model = CatBoostClassifier(
    iterations=1000,
    thread_count=7,
    eval_metric=['TotalF1'],
    loss_function='MultiClass',
    logging_level='Verbose'
)

In [None]:
model.fit(
    Xtrain, ytrain,
    cat_features=categorical_features_indices,
    eval_set=(Xdev, ydev),
    plot=True
);

In [None]:
model.save_model("data/standardizer/models/cb_set1_1000.model")

In [7]:
model = lgb.Booster(model_file='models/lightgbm_sso_1.model')
pred_train = model.predict(Xtrain)
pred_dev = model.predict(Xdev)
pred_test = model.predict(Xtest)

preds_train = pred_train
preds_dev = pred_dev
preds_test = pred_test

pred_train, pred_dev, pred_test = [], [], []

preds1 = [preds_train, preds_dev, preds_test]
preds2 = [pred_train, pred_dev, pred_test]

for pred, predictions in zip(preds1, preds2):
    for x in pred:
        predictions.append(np.argmax(x))


print(
    metrics.accuracy_score(ytrain, pred_train),
    metrics.accuracy_score(ydev, pred_dev),
    metrics.accuracy_score(ytest, pred_test)
)

print(
    metrics.classification_report(ytrain, pred_train),
    metrics.classification_report(ydev, pred_dev),
    metrics.classification_report(ytest, pred_test)
)

print(
    metrics.confusion_matrix(ytrain, pred_train),
    metrics.confusion_matrix(ydev, pred_dev),
    metrics.confusion_matrix(ytest, pred_test),
)

0.9864751631519683 0.9860178406200195 0.9859607153586025
              precision    recall  f1-score   support

           0       0.99      0.96      0.97    231274
           1       0.98      0.98      0.98   2693514
           2       0.97      0.96      0.96   1129061
           3       1.00      1.00      1.00   4900000
           4       0.98      0.98      0.98   1857277
           5       0.99      0.99      0.99   4814309
           6       0.98      0.98      0.98   1382781
           7       0.98      0.99      0.98   1617517

   micro avg       0.99      0.99      0.99  18625733
   macro avg       0.98      0.98      0.98  18625733
weighted avg       0.99      0.99      0.99  18625733
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     49559
           1       0.98      0.98      0.98    577182
           2       0.97      0.96      0.96    241941
           3       1.00      1.00      1.00   1050000
           4       0.

In [12]:
print(metrics.confusion_matrix(le.inverse_transform(ytrain), le.inverse_transform(pred_train), le.classes_))

[[ 222905    5856     808    1705       0       0       0       0]
 [   2296 2641452   31868   17898       0       0       0       0]
 [    572   41627 1084434    2428       0       0       0       0]
 [    469   14768    2243 4882520       0       0       0       0]
 [      0       0       0       0 1819398   37879       0       0]
 [      0       0       0       0   40116 4774193       0       0]
 [      0       0       0       0       0       0 1355231   27550]
 [      0       0       0       0       0       0   23827 1593690]]


In [15]:
pd.DataFrame(metrics.confusion_matrix(ytest, pred_test), columns=le.classes_, index=le.classes_)

Unnamed: 0,آ,أ,إ,ا,ة,ه,ى,ي
آ,47556,1427,223,353,0,0,0,0
أ,511,565760,6939,3972,0,0,0,0
إ,132,9117,232183,510,0,0,0,0
ا,104,3112,535,1046249,0,0,0,0
ة,0,0,0,0,389559,8429,0,0
ه,0,0,0,0,9046,1022591,0,0
ى,0,0,0,0,0,0,290181,6130
ي,0,0,0,0,0,0,5494,341116


### The followng code deals with standardizing non-standard segmenter data.

- `data/raw/test1.tsv` -> Standard Segmenter data. Al Mannar Corpus
- `data/raw/test1_sso.tsv` -> The above data which has been artificially non-standardized

Let us now standardize the artificially non-standardized data

In [7]:
"""
This converts the non-standardized segmentation test file into fc1 features
so that it can be artificially standardized again.
"""
filename = "data/raw/test1_sso.tsv"
fc = 'fc1'
outfilename = "data/sso/processed/segtest1_fc1.tsv"

symbols = set(('ا', 'ه'))
sso = pd.read_csv(filename, sep='\t')

problem_lines = []

with open(outfilename, 'w') as outfile:
    fwriter = csv.writer(outfile, delimiter='\t')
    fwriter.writerow(['sentence_no', 'word_no', 'char_no'] + feature_codes[fc])
    for i, sent in enumerate(sso['raw']):
        try:
            sent = sent.split()
            for word_pos, word in enumerate(sent):
                for char_pos, char in enumerate(word):
                    if char in symbols or (char_pos == len(word)-1 and char == 'ى'):
                        features = char_to_features(char_pos, word_pos, sent, feature_codes[fc])
                        #target = [sso['standard'][i].split()[word_pos][char_pos]]
                        fwriter.writerow([i, word_pos, char_pos] + features)
        except AttributeError:
            problem_lines.append(i)

print(problem_lines)

[]


Now there are three files namely:
1. `data/raw/test1.tsv` -> Standard Segmenter data. All Mannar Corpus
2. `data/raw/test1_sso.tsv` -> The above data which has been artificially non-standardized (We also call it **sso** in the code)
3. `data/sso/processed/segtest1_fc1.tsv` -> SSO features generated from lgb model from (2) [We also call it **segtest** in the code]
4. In the following code, our goal is create a new `overall_test.tsv` frame which we will call overall_test in the code. This frame will have the following columns:
    - `original_raw` (The original sentence from the Al-Mannar Corpus)
    - `original_seg` (Original gold annotations from Al-Mannar Corpus)
    - `sso_raw` (Artificially SSO'd sentences)
    - `sso_seg` (Artificially SSO'd annotations)
    - `sso_raw_standardized` (sso_raw sentences which have been standardized by the lgbm standardizer)
    - `sso_raw_standardized_segmented` (Output produced when Catboost segmenter is run on *sso_raw_standardized*)

In [6]:
# Load the label encoder and the lgbm model for sso
file = 'data/combined/processed/test1_sso_fc1.tsv'
segtest = pd.read_csv(file, sep='\t')
le = LabelEncoder()
le.classes_ = np.load('models/sso_encoder_classes.npy')
model = lgb.Booster(model_file='data/standardizer/models/lgbm_set1_1500.model')

# Make predictions using segtest. Add a new column to segtest called target 
segtestframe = segtest[feature_codes['fc1']]
for col in segtestframe.columns[1:]:
    segtestframe[col] = segtestframe[col].astype('category')
predictions = model.predict(segtestframe)
target = []
for pred in predictions:
    target.append(np.argmax(pred))
target = le.inverse_transform(target)
segtest['target'] = target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [109]:
# The following block of code will standardize substandard sentences.
# Our main targets are test1_sso.tsv and test2_sso.tsv. They are both 
# present in data/combined/raw. Their corresponding feature files are 
# have been created for the segmented and are present in data/combined/processed 
# Since, the segmenter and the standardizer both use the same feauture sets, 
# we can use it for out purposes.
def standardize(inframe, model, label_encoder, feature_code='fc1'):
    """
    inframe: The input feature frame
    Returns: A column consisting of standardized sentences.
    """
    segtestframe = inframe[feature_codes[feature_code]]
    for col in segtestframe.columns[1:]:
        segtestframe[col] = segtestframe[col].astype('category')
    predictions = model.predict(segtestframe)
    target = []
    for pred in predictions:
        target.append(np.argmax(pred))
    target = label_encoder.inverse_transform(target)
    inframe['predictions'] = target

    inframe['next_char_word_no'] = inframe['word_no'].shift(-1)
    inframe['replace_alif_ta'] = (inframe['focus'] == 'ه') | (inframe['focus'] == 'ا')
    inframe['last_char'] = inframe['word_no'] != inframe['next_char_word_no']
    inframe['replace_ya'] = inframe['last_char'] & (inframe['focus'] == 'ى')
    inframe['really_replace'] = inframe['replace_alif_ta'] | inframe['replace_ya']
    inframe['standardized_char'] = inframe['focus']
    inframe['standardized_char'][inframe['really_replace'] == True] = inframe['predictions'][inframe['really_replace'] == True]
    words = inframe.groupby(['file', 'sentence_no', 'word_no'])['standardized_char'].apply(lambda x: ''.join(list(x)))
    sentences = words.groupby(['file', 'sentence_no']).apply(lambda x: ' '.join(list(x)).strip())
    return sentences

le = LabelEncoder()
le.classes_ = np.load('models/sso_encoder_classes.npy')
model = lgb.Booster(model_file='data/standardizer/models/lgbm_set1_1500.model')
inframe = pd.read_csv('data/combined/processed/test1_sso_fc1.tsv', sep='\t')
test1std = standardize(inframe, model, le)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [119]:
test1std['file'] == 'P607'

KeyError: 'file'

In [111]:
test1base = pd.read_csv('data/combined/raw/test1_sso.tsv', sep='\t')
test1base['sso_raw_standardized'] = test1std.values
test1base.to_csv('data/combined/results/test1result.tsv', sep='\t')

In [98]:
test2base = pd.read_csv('data/combined/raw/test2_sso.tsv', sep='\t')
test2base['sso_raw_standardized'] = test2std.values
test2base.to_csv('data/combined/results/test2result.tsv', sep='\t')

In [115]:
test1base.shape

(1844, 7)

In [20]:
test1_orig = pd.read_csv('data/raw/test1.tsv', sep='\t')
overall_test = pd.DataFrame()

In [21]:
overall_test['file_name'] = sso['file']
overall_test['sent_no'] = sso['sentence']
overall_test['original_raw'] = test1_orig['raw']
overall_test['original_seg'] = test1_orig['seg']
overall_test['sso_raw'] = sso['raw']
overall_test['sso_seg'] = sso['seg']
overall_test['standardized_sso_raw'] = sso['raw_standardized']

In [24]:
overall_test.to_csv("data/overall_test.tsv", '\t')

In [None]:
def standardize(sent, le, model, fc):    
    symbols = set(('ا', 'ه'))
    sent = sent.split()
    standard_sent = []
    for word_pos, word in enumerate(sent):
        standard_word = ""
        for char_pos, char in enumerate(word):
            if char in symbols or (char_pos == len(word)-1 and char=='ى'):
                test_frame = pd.DataFrame(columns=feature_codes[fc])
                #test_frame['chr_position'] = test_frame['chr_position'].astype('int64')
                for col in test_frame.columns:
                    test_frame[col] = test_frame[col].astype(Xtrain[col].dtype)
                print(test_frame.dtypes)
                features = char_to_features(char_pos, word_pos, sent, feature_codes[fc])
                test_frame.loc[len(test_frame)] = features
                print(test_frame)
                # print(np.array(features).reshape(1, len(features)))
                target = np.argmax(model.predict(test_frame))
                target = le.inverse_transform(target)
            else:
                target = char
            standard_word += target
        standard_sent.append(word)
    return " ".join(standard_sent)
    
    
# The following block of code will standardize substandard sentences.
# Our main targets are test1_sso.tsv and test2_sso.tsv. They are both 
# present in data/combined/raw. Their corresponding feature files are 
# have been created for the segmented and are present in data/combined/processed 
# Since, the segmenter and the standardizer both use the same feauture sets, 
# we can use it for out purposes.
def standardize(inframe, model, label_encoder, feature_code='fc1'):
    """
    inframe: The input feature frame
    Returns: A column consisting of standardized sentences.
    """
    segtestframe = inframe[feature_codes[feature_code]]
    for col in segtestframe.columns[1:]:
        segtestframe[col] = segtestframe[col].astype('category')
    predictions = model.predict(segtestframe)
    target = []
    for pred in predictions:
        target.append(np.argmax(pred))
    target = label_encoder.inverse_transform(target)
    inframe['predictions'] = target

    inframe['sso_char'] = inframe['focus']
    print(inframe.shape)
    for i in tqdm(range(len(inframe) - 1)):
        if ((inframe['focus'][i] == 'ى' and inframe['word_no'][i] != inframe['word_no'][i+1])
            or inframe['focus'][i] == 'ه'
            or inframe['focus'][i] == 'ا'):
            inframe['sso_char'][i] = inframe['predictions'][i]
    words = inframe.groupby(['file', 'sentence_no', 'word_no'])['sso_char'].apply(lambda x: ''.join(list(x)))
    return words.groupby(['file', 'sentence_no']).apply(lambda x: ' '.join(list(x)).strip())

le = LabelEncoder()
le.classes_ = np.load('models/sso_encoder_classes.npy')
model = lgb.Booster(model_file='data/standardizer/models/lgbm_set1_1500.model')
inframe = pd.read_csv('data/combined/processed/test2_sso_fc1.tsv', sep='\t')
test2std = standardize(inframe, model, le)




sso = pd.read_csv('data/combined/raw/test1_sso.tsv', sep='\t')
sso['raw_standardized'] = sso['sso_raw']

for i in range(len(segtest)):
    sentence_no = segtest['sentence_no'][i]
    word_no = segtest['word_no'][i]
    char_no = segtest['chr_position'][i]
    target = segtest['target'][i]
    if segtest['focus'][i] != segtest['target'][i]:
        rs_sent = sso['raw_standardized'][sentence_no].split()
        rs_sent[word_no] = rs_sent[word_no][:char_no] + target + rs_sent[word_no][char_no+1:]
        sso['raw_standardized'][sentence_no] = " ".join(rs_sent)