# This file is part of Idiomata Cognitor.
#
# Idiomata Cognitor is free software: you can redistribute it and/or modify
# it under the terms of the Apache-2.0 License as published by
# the Apache Software Foundation.
#
# Idiomata Cognitor is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache-2.0 License for more details.
#
# You should have received a copy of the Apache-2.0 License
# along with Idiomata Cognitor. If not, see <http://www.apache.org/licenses/>.

import argparse

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

labels = {
    'spa': 'Spanish',
    'cat': 'Catalan',
    'arg': 'Aragonese',
    'arn': 'Aranese',
    'oci': 'Occitan',
    'ast': 'Asturian',
    'glg': 'Galician',
    'ita': 'Italian',
    'fra': 'French',
    'por': 'Portuguese'
}
"""
Max length of training corpus per language
Ensure each language has similar amount of representation (Balanced Dataset)
"""
MAX_LENGTH = 10000


def read_languages_data(path):
    with open(path) as f:
        language_transcription = f.readlines()
    language_transcription = language_transcription[:MAX_LENGTH]
    return language_transcription


def combine_language_data(sentences, language_index):
    sentences = np.array(sentences)
    sentences = sentences.reshape(sentences.shape[0], 1)
    target = np.zeros((sentences.shape[0], 1))
    target += language_index
    language_data = np.hstack((sentences, target))
    return language_data
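
# Illustrative example (hypothetical input): combine_language_data(['hola', 'adios'], 1)
# returns a (2, 2) string array pairing each sentence with its language index,
# roughly [['hola', '1.0'], ['adios', '1.0']].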


def shuffle_rows(languages):
    index = np.arange(0, len(languages))
    np.random.shuffle(index)
    shuffled_languages = languages[index, :]
    return shuffled_languages
"""
Run all data preprocessing helper functions
"""
def preproccess_raw_data(file_paths):
# Read all raw text data from file paths
language_transcriptions = [ read_languages_data(path) for path in file_paths ]
# Combine each language with its language_index
languages = [ combine_language_data(sentences,i+1) for i,sentences in enumerate(language_transcriptions) ]
# Vertically stack all data into one 2D np.array
languages = np.vstack((languages))
# Shuffle languages by row
languages = shuffle_rows(languages)
return languages
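
# Illustrative example: with ten corpus files of up to MAX_LENGTH sentences each,
# preprocess_raw_data returns a shuffled array of shape (<= 100000, 2), where
# column 0 holds the raw sentence text and column 1 the 1-based language index.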


def main() -> None:
    parser = argparse.ArgumentParser(description="Train a Romance language identifier from monolingual corpora")
    parser.add_argument("--spa", type=str, required=True, help="Spanish training corpus")
    parser.add_argument("--cat", type=str, required=True, help="Catalan training corpus")
    parser.add_argument("--arg", type=str, required=True, help="Aragonese training corpus")
    parser.add_argument("--arn", type=str, required=True, help="Aranese training corpus")
    parser.add_argument("--oci", type=str, required=True, help="Occitan training corpus")
    parser.add_argument("--ast", type=str, required=True, help="Asturian training corpus")
    parser.add_argument("--ita", type=str, required=True, help="Italian training corpus")
    parser.add_argument("--glg", type=str, required=True, help="Galician training corpus")
    parser.add_argument("--fra", type=str, required=True, help="French training corpus")
    parser.add_argument("--por", type=str, required=True, help="Portuguese training corpus")
    parser.add_argument("--output-model", type=str, required=True, help="Path where the trained model is saved")
    args = parser.parse_args()
    # Get all file paths in the same order as the labels dict
    file_paths = [getattr(args, language) for language in labels]
    # Preprocess all raw text into a form suitable for TfidfVectorizer
    languages = preprocess_raw_data(file_paths)
    df_languages = pd.DataFrame(languages)
    df_languages.columns = ['natural language', 'language index']
    df_languages['language index'] = df_languages['language index'].apply(float)
    # Map the numeric language index back to a human-readable language name
    index_to_language = {float(i + 1): name for i, name in enumerate(labels.values())}
    df_languages['language'] = df_languages['language index'].map(index_to_language)
    # Report dataset dimensions
    print(df_languages.shape)
    # Split data into raw features and labels
    language_features = df_languages['natural language']
    language_targets = df_languages['language index']
    # Show how many sentences each language contributes
    unique, counts = np.unique(language_targets, return_counts=True)
    print(dict(zip(unique, counts)))
    # Split data into training and test sets:
    # train on 70% of the data, test on the remaining 30%
    X_train, X_test, y_train, y_test = train_test_split(language_features,
                                                        language_targets,
                                                        test_size=0.3,
                                                        random_state=42)
    # Build a pipeline with a character n-gram TfidfVectorizer and a Multinomial Naive Bayes classifier
    tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
    model = MultinomialNB()
    text_clf = Pipeline([('tfidf', tfidf_vect), ('clf', model)])
    # Train the pipeline classifier
    text_clf.fit(X_train, y_train)
    # Save the fitted pipeline (vectorizer + classifier) to disk
    joblib.dump(text_clf, args.output_model)
    # Measure accuracy on the held-out test set
    predictions = text_clf.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions, target_names=list(labels.values())))


if __name__ == "__main__":
    main()
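
# Example usage (hypothetical corpus paths and model name):
#
#   python lang_identification_train.py \
#       --spa spa.txt --cat cat.txt --arg arg.txt --arn arn.txt --oci oci.txt \
#       --ast ast.txt --ita ita.txt --glg glg.txt --fra fra.txt --por por.txt \
#       --output-model romance_identifier.joblib
#
# The saved pipeline can then be loaded for inference, e.g.:
#
#   import joblib
#   clf = joblib.load("romance_identifier.joblib")
#   language_index = clf.predict(["Bon dia a tothom"])[0]  # 1-based index into labels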