Natural Language Processing သုတေသန လုပ်ကြတော့မယ်ဆိုရင် အရင်ဆုံး text file တွေကို cleaning လုပ်တာ၊ encoding ပြောင်းတာ၊ ရှိနေတဲ့ format ကို ကိုယ်လိုချင်တဲ့ ပုံစံဖြစ်အောင် ပြောင်းရတာ၊ လိုချင်တဲ့ စာလုံးတွေ၊ စာကြောင်းတွေကိုပဲ ဆွဲထုတ်ယူတာ စသည်ဖြင့် လုပ်ရတဲ့ အလုပ်တွေက အများကြီးပါပဲ။ Experiment တွေကို လုပ်ဖို့အတွက်က နေ့စဉ်လိုလို shell, perl (အခုနောက်ပိုင်းမှာတော့ python language) နဲ့ ပရိုဂရမ်တွေကို ရေးကြရပါတယ်။ တစ်ခါတစ်လေမှာ format တစ်ခုကနေ နောက်တခြား format တစ်ခုကို ပြောင်းဖို့အတွက် ပရိုဂရမ်တစ်ပုဒ်ကို တစ်ရက်လုံး အချိန်ပေးပြီး ရေးလိုက်ရတာမျိုးလည်း ရှိပါတယ်။ အဲဒါကြောင့် အသုံးဝင်နိုင်မယ့် bash, perl, python ပရိုဂရမ်တွေကို ကျွန်တော် အချိန်ရရင်ရသလို တင်ပေးသွားပါမယ်။ တစ်ခုမှာချင်တာက ကျွန်တော်တင်ပေးထားတဲ့ ပရိုဂရမ်တွေကို အခြေခံပြီးတော့ shell, perl, python scripts တွေကို ကိုယ်တိုင်ရေးနိုင်အောင် ကြိုးစားကြပါ။
သုံးပုံသုံးနည်း အသေးစိတ်ကိုတော့ သက်ဆိုင်ရာ ဖိုလ်ဒါအသီးသီးမှာ ရှိတဲ့ example-usages.md (for bash, for perl, for python) ဖိုင်တွေကို မှီငြမ်းပါ။
ရဲကျော်သူ
- read-and-move.sh
- change-filenames.sh
- rm-date-sentences.sh
- print-classID-prediction-result.sh
- compare-img-or-pdf.sh
- chk-sort-by-columns.sh
- kill-all-detached.sh
- unzip-all-with-one-passwd.sh
- cut-filename.sh
- calc-avg.sh
- print-latex-section.sh
- list-mistake-5-suggestion.sh
- mytxt2pdf.sh
- prepare-open-test-data.sh
- print-CRLF.sh
- group-files.sh
- segmentation.sh
- split-even-odd-pdf.sh
- even-odd.sh
- rm-stopwords.sh
- rm-spaces-lineno.sh
- blowfish.sh
- replace-with-lineno.sh
- replace-with-lineno2.sh
- OOV-count.sh
- find-blank-lines.sh
- dot2png-pdf.sh
- add-start-end.sh
- get-words-with-position.sh
- count-string-length.sh
- strip-substring.sh
- chk_total_duration.sh
- print-sentenceID-count.sh
- mk-16KHz-mono.sh
- mk-spectrogram.sh
- group-UCF11.sh
- group-within-group-UCF11.sh
- dot2pic.sh
- 2mono-pdf.sh
- calc-bleu-all.sh
- calc-ribes-all.sh
- print-matched-x.sh
- split-train-dev-test.sh
- clean-space-all.sh
- mk-g2p-model.sh
- mk-syl-list.sh
- rm-200b-200d.sh
- print-char.sh
- prepare-10fold-smt-pair.sh
- rm-ctrl-m.sh
- x-letter-word.sh
- paste-column.sh
- lm-building-exec.sh
- print-most-common.sh
- calc-ppl-with-kenlm-query.sh
- mk-two-lm-and-merge.sh
- mk-class-lm.sh
- get-myPOS-tag.sh
- rm-myPOStags.sh
- print-same-col1.sh
- char-segmentation.sh
- chk-blank-fields.sh
- chk-field-length.sh
- crop-pdf.sh
- excel2csv-chk-fields.sh
- change-format.sh
- format-mecab-pos.sh
- cp-config.sh
- DELETE-ALL.sh
- trim-silence.sh
- wav2wavform.sh
- mytext2pic.sh
- formula-pic.sh
- rm-heading-tab-lineno.sh
- mk-10cross-data.sh
- align-GIZA++.sh
- date-time-info.sh
- mp4-to-wav.sh
- my-font-chk.sh
- rec-recorder.sh
- mp42gif.sh
- extract-target-text.sh
- txt2png.sh
- pic2histogram.sh
- tesseract-ocr.sh
- sylbreak-10fold-mt.sh
- syllable-break-multi-files.sh
- build-fastalign-pt.sh
- txt2ASL-BSL.sh
- mgiza-align.sh
- add-dummy-word-mk-csv.sh
- kidbright-burmese-transcription.sh
- count-csv-fields.sh
- sylbreak-gui.sh
- espeak-and-zenity.sh
- find-edit-gui.sh
- sqlite3-gui.sh
- mk-background-transparent.sh
- spelling-checker-with-dict.sh
- chop-by-silence.sh
- random-no.sh
- sort-capitalized-letter-first.sh
- chk-wavefile-duration-for-unicode-filename.sh
- calc-chrF.sh
- check-end-mark.sh
- word2pdf.sh
- nllb-translate.sh
- install_python.sh
- extract_key_value_of_json.sh
- clean-space.pl
- rm-EnglishSentences.pl
- word-analysis.pl
- print-emojiSentences.pl
- dq-multilines.pl
- mk-abstract-para.pl
- print-mySentenceOnly.pl
- rm-symbol-and-myVowel-only-sentences.pl
- rm-space-btw-numbers.pl
- print-ngram.pl
- print-codepoint.pl
- wc.pl
- wordlimit.pl
- wordwrap.pl
- get-syl-potma.pl
- my-linebreak.pl
- rm-ne-tag.pl
- clean-v-without-c.pl
- x-x-to-x-comma-x-with-brackets.pl
- select-en-th-my.pl
- mk-speakers-json.pl
- string-distance.pl
- print-matched-char-seq.pl
- search-common.pl
- fixed-parallel-order.pl
- encode-input.pl
- decode.pl
- mk-one2one-freq.pl
- mk-one-syl-confusion.pl
- rm-onechar-line.pl
- replace-with-lineno.pl
- chk-pos-tags.pl
- count-string-length.pl
- print-diff-word.pl
- print-union-isect-diff.pl
- print-common-kachin.pl
- sylbreak.pm
- test.sylbreak.pm.pl
- tag-BI.pl
- bigram-similarity.pl
- chk-src-trg-words.pl
- print-my-numeric-sentence.pl
- number-punct-segmentation.pl
- tabpair-to-crfcol.pl
- print-blank-lines.pl
- add-spu_id.pl
- human-mt-eval-form.pl
- trainTuneScore_jamy.pl
- rm-blank-line.pl
- gizaA3-4human.pl
- print-fngram-format.pl
- print-myWordOnly.pl
- fastalign-4human.pl
- find-one-file-words-in-another.pl
- mypos2json.pl
- roman2myno.pl
- bracket-tree2sentence.pl
- clean-punctuation.pl
- mk-spelling-dict.pl
- remove-one-char-lines.pl
- clean-brackets-tags.pl
- check-empty-field.pl
- eng-sentence-split.pl
- chk-token.py
- numpy-array-element-compare.py
- char-count-element-wise.py
- char-startswith-element-wise.py
- fuzzy-match.py
- hex2uni.py
- korean-breaks.py
- epitranscribe.py
- plot-unicode-char.py
- en-sentence-tokenizer.py
- en-word-tokenizer.py
- en-tokenization-on-punctuation.py
- filter-en-stopwords.py
- mk-QR-code.py
- wu-palmer-similarity.py
- nltk-en-pos-tagger.py
- folder-file-dict.py
- csv-str2mapping123.py
- str2mapping123.py
- str2my-edit-distances.py
- mypos2upos.py
- isolation-forest.py
- accuracy.py
- how-name-eq-main-work.py
- f1-score-calc.py
- multi-class-f1.py
- language-detect.py
- python-list-eg.py
- split-train-test.py
- split-train-valid-test.py
- add-sign.py
- add-sign-onepage-pdf.py
- print-img-resolution.py
- print-pixel-value.py
- RGB2grey.py
- image2npy.py
- syl2freq.py
- syl2tf.py
- syl2idf.py
- syl2tf-idf.py
- syl2onehot-sklearn-4teaching.py
- syl2onehot-sklearn.py
- zawgyi2unicode.py
- zawgyi2unicode-syl.py
- word2vec.py
- make-edit-error.py
- 8eval.py
- soundex-metaphone.py
- 7sim.py
- abugida.py
- tex-spellcheck.py
- video_augment.py
- mk-video-class.py
- mk-video-class-for-sentence.py
- m4v_to_mp4.py
- mov_to_mp4.py
- jiwer_wer_mer_wil.py
- passphrase_generator.py
- rule_based_password_gen.py
- MOS_eval.py
- spacy_pos_ner.py
- spacy_pos_dep_jp.py
- spacy_pos_ner_dep_zh.py
- nltk-lm.py
- nltk-lm-predict.py
- format_conversion.py
- format_conversion_with_error_check.py
- cut_columns.py
- bidirectional_maximum_matching.py
- extract_filename_parts.py
- sort_openslr_transcript.py
- speech_corpus_info.py
- dKNN.py
- dKNN-ver2.py
- change_sampling_rate.py
- check_silence.py
- graph_lm_spellchek.py
- detect_language_ver1.py
- detect_language_ver2.py
- embedder.py
- test_embedding.py
- convert_to_conllu.py
- convert_to_spacyNER_json.py
- split_parallel_data.py
- clean_text.py
- extract_emoji.py
- compare_characters.py
- word_length_analysis.py
- comma2tab_label2digit.py
- conv_delimiter_label2digit.py
- padsint_detection.py
- replace_pipe_with_space.py
- pos_pattern_checker.py
- sort_ngram.py
- analyze_NER_corpus.py
- compare_sentence_tag_distributions.py
- compare_word_tag_distributions.py
- print_codepoint.py
- syl_ngram_mi.py
- txt_dl.py
- markov_txt_gen.py
- tesseract_ocr.py
- NER_23to9_conv.py
- tf_event2txt.py
- hangul_syl_generator.py
- ngram_segmentation.py
- long_sentence_wrapper.py
- mm_proverb_parser.py
- grapheme_tokenizer.py
- icu_collation.py
- icu_transliteration.py
- my_transliteration.py
- kana2roman.py
- prefix_suffix_extract.py
- mk_only_my.py
- rm_my_two_symbols.py
- char_segmentation.py
- fasttext_format_converter.py
- run_sylbreak.py
- rm_zwnj_zwsp_hsp.py
- clean_non_burmese.py
- eval_ngram_lm.py
- parquet_extractor.py
- g2p-compare.py
- extract-ReMeDi.py
- split-sentences-by-pipe.py
- format-conv.py
- wtc-paste.py
- cv-split.py
- mk_hatespeech_dict.py
- train_embedding.py
- convert_to_two_words_dict.py
- emoji_count.py
- rm_blank_line.py
- my_no_spacing.py
- punc_emoji_spacing.py
- check_tag_distribution.py
- check_corpus_tag_distribution.py
- utf8_file_char_counter.py
- word_shuffle.py
- chk-syspath.py