Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

add stemming algorithm, not well tested

  • Loading branch information...
commit 3b9bf068673492cd3431a72de1f37aeb0745aaad 1 parent 373750e
Dmitry authored
18 Samples/Program.fs
... ... @@ -1,9 +1,9 @@
1 1 module Samples
2 2
3 3 //open TinyNLP.Tokenizer
4   -open TinyNLP.Synonyms
  4 +open TinyNLP.Synonymizer
5 5
6   -let input_string = "Я хочу сказать что-нибудь о Рио-де-Жанейро и о г.Бердске"
  6 +let input_string = "Я хочу сказать что-нибудь прекрасное и возвышенное как о Рио-де-Жанейро так и о г. Бердске"
7 7
8 8 let duration f =
9 9 let timer = new System.Diagnostics.Stopwatch()
@@ -12,13 +12,17 @@ let duration f =
12 12 printfn "%A Ellapsed Time: %f ms" (f.GetType().ToString()) timer.Elapsed.TotalMilliseconds
13 13 returnValue
14 14
15   -
  15 +let stemSample input =
  16 + let r = TinyNLP.Tokenizer.tokenize input_string
  17 + match r with
  18 + | None -> ["Error"]
  19 + | _ -> r.Value |> List.map (fun x -> TinyNLP.Stemming.Stem x)
16 20
17 21 let sampleTinyNLPSynonyms =
18 22 let word1 = "слово"
19 23 let word2 = "дело"
20   - printfn "synonyms for %A is %A" word1 (getSynonyms word1)
21   - printfn "synonyms for %A is %A" word2 (getSynonyms word2)
  24 + printfn "synonyms for %A is %A" word1 (getSynonyms (TinyNLP.Stemming.Stem word1))
  25 + printfn "synonyms for %A is %A" word2 (getSynonyms (TinyNLP.Stemming.Stem word2))
22 26
23 27 let tokenizeSample =
24 28 let r = TinyNLP.Tokenizer.tokenize input_string
@@ -28,4 +32,6 @@ let tokenizeSample =
28 32
29 33 duration (fun () -> sampleTinyNLPSynonyms) |> ignore
30 34
31   -printfn "%A" (duration (fun () -> tokenizeSample))
  35 +printfn "token %A" (duration (fun () -> tokenizeSample))
  36 +
  37 +printfn "stem %A" (duration (fun () -> stemSample input_string))
3  Samples/bin/Debug/Samples.XML
@@ -14,6 +14,9 @@
14 14 <member name="">
15 15
16 16 </member>
  17 +<member name="">
  18 +
  19 +</member>
17 20 <member name="T:Samples">
18 21
19 22 </member>
BIN  Samples/bin/Debug/Samples.exe
Binary file not shown
BIN  Samples/bin/Debug/Samples.pdb
Binary file not shown
BIN  Samples/bin/Debug/TinyNLP.fs.dll
Binary file not shown
BIN  Samples/bin/Debug/TinyNLP.fs.pdb
Binary file not shown
71 Samples/bin/Debug/TinyNLP.fs.xml
@@ -68,6 +68,75 @@
68 68 <member name="">
69 69
70 70 </member>
  71 +<member name="T:TinyNLP.Stemming">
  72 +
  73 +</member>
  74 +<member name="">
  75 +
  76 +</member>
  77 +<member name="">
  78 +
  79 +</member>
  80 +<member name="">
  81 +
  82 +</member>
  83 +<member name="">
  84 +
  85 +</member>
  86 +<member name="">
  87 +
  88 +</member>
  89 +<member name="">
  90 +
  91 +</member>
  92 +<member name="">
  93 +
  94 +</member>
  95 +<member name="">
  96 +
  97 +</member>
  98 +<member name="">
  99 +
  100 +</member>
  101 +<member name="">
  102 +
  103 +</member>
  104 +<member name="">
  105 +
  106 +</member>
  107 +<member name="">
  108 +
  109 +</member>
  110 +<member name="">
  111 +
  112 +</member>
  113 +<member name="">
  114 +
  115 +</member>
  116 +<member name="">
  117 +
  118 +</member>
  119 +<member name="">
  120 +
  121 +</member>
  122 +<member name="">
  123 +
  124 +</member>
  125 +<member name="">
  126 +
  127 +</member>
  128 +<member name="">
  129 +
  130 +</member>
  131 +<member name="">
  132 +
  133 +</member>
  134 +<member name="">
  135 +
  136 +</member>
  137 +<member name="">
  138 +
  139 +</member>
71 140 <member name="">
72 141
73 142 </member>
@@ -98,7 +167,7 @@
98 167 <member name="">
99 168
100 169 </member>
101   -<member name="T:TinyNLP.Synonyms">
  170 +<member name="T:TinyNLP.Synonymizer">
102 171
103 172 </member>
104 173 <member name="">
BIN  Samples/obj/Debug/Samples.exe
Binary file not shown
BIN  Samples/obj/Debug/Samples.pdb
Binary file not shown
BIN  Samples/obj/Debug/TinyNLP.Samples.fsprojResolveAssemblyReference.cache
Binary file not shown
98 Stemming.fs
... ... @@ -0,0 +1,98 @@
  1 +module TinyNLP.Stemming
  2 +
  3 +open System;
  4 +open System.Collections.Generic;
  5 +open System.Linq;
  6 +open System.Text;
  7 +open System.Text.RegularExpressions;
  8 +
  9 +let c_vower = "аеиоуыэюя"
  10 +let c_perfectiveground = "((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$"
  11 +let c_reflexive = "(с[яь])$"
  12 +let c_adjective = "(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|еых|ую|юю|ая|яя|ою|ею)$"
  13 +let c_participle = "((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$"
  14 +let c_verb = "((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$"
  15 +let c_noun = "(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|и|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$"
  16 +let c_rvre = "^(.*?[аеиоуыэюя])(.*)$"
  17 +let c_derivational = "[^аеиоуыэюя][аеиоуыэюя]+[^аеиоуыэюя]+[аеиоуыэюя].*(?<=о)сть?$"
  18 +let c_eng = "[a-z0-9]"
  19 +let c_i = "и$"
  20 +let c_ost = "ость?$"
  21 +let empty_string = String.Empty
  22 +
  23 +let RegexReplace (original:string, regx:string, value:string) =
  24 + let reg = new Regex(regx)
  25 + let n = reg.Replace(original, value)
  26 + (original.Equals(n), n)
  27 +
  28 +let RegexMatch (original:string, regx:string) =
  29 + let reg = new Regex(regx)
  30 + reg.Match(original)
  31 +
  32 +let RegexMatches (original:string, regx:string) =
  33 + let reg = new Regex(regx, RegexOptions.Multiline);
  34 + reg.Matches(original)
  35 +
  36 +let inline isEnd word =
  37 + let matches = RegexMatches(word, c_rvre)
  38 + matches.Count < 1
  39 +
  40 +let step1 word =
  41 + let a = RegexReplace (word, c_perfectiveground, empty_string)
  42 + match a with
  43 + | (false, w) -> w
  44 + | (true, w) ->
  45 + let b1 = RegexReplace (w, c_reflexive, empty_string)
  46 + let b2 = RegexReplace(snd(b1), c_adjective, empty_string)
  47 + match b2 with
  48 + | (false, w1) -> snd(RegexReplace(w1, c_participle, empty_string))
  49 + | (true, w1) ->
  50 + match RegexReplace(w1, c_verb, empty_string) with
  51 + | (true, w2) -> snd(RegexReplace(w2, c_noun, empty_string))
  52 + | (false, w2) -> w2
  53 +
  54 +let step2 word =
  55 + snd(RegexReplace(word, c_i, empty_string))
  56 +
  57 +let step3 word =
  58 + let m = RegexMatch(word, c_derivational)
  59 + match m.Success with
  60 + | true -> snd(RegexReplace(word, c_ost, empty_string))
  61 + | _ -> word
  62 +
  63 +let step4 word =
  64 + let m = RegexReplace(word, "ь$", empty_string)
  65 + match m with
  66 + | (false, w) -> w
  67 + | (true, w) -> snd(RegexReplace(snd(RegexReplace(w, "ейше?", empty_string)), "нн$", "н"))
  68 +
  69 +let Stem (inword:string) =
  70 + //let lowword = word.ToLower().Trim().Replace("ё", "е")
  71 + let word = inword.ToLower().Trim().Replace("ё", "е");
  72 + // let rec instem word =
  73 + // printfn "%A" word
  74 + match isEnd word with
  75 + | true -> word
  76 + | false -> (RegexMatches(word, c_rvre).Item 0).Value |> step1 |> step2 |> step3 |> step4 //|> instem
  77 + // instem lowword
  78 +
  79 +
  80 +
  81 +//
  82 +//
  83 +// // step 4
  84 +// if (!RegexReplace(ref rv, "ь$", string.Empty))
  85 +// {
  86 +// RegexReplace(ref rv, "ейше?", string.Empty);
  87 +// RegexReplace(ref rv, "нн$", "н");
  88 +// }
  89 +//
  90 +// value = rv;
  91 +//
  92 +// } while (false);
  93 +//
  94 +// return value;
  95 +// }
  96 +//
  97 +// }
  98 +//}
40 Synonymizer.fs
... ... @@ -0,0 +1,40 @@
  1 +module TinyNLP.Synonymizer
  2 +
  3 +open ProtoBuf
  4 +open Kevo.Store
  5 +/// Lexical class (part of speech) tag; used as the type of WordItem.Part.
  6 +type LexicalClass =
  7 + | Noun = 1 // noun
  8 + | Verb = 2 // verb
  9 + | Adverb = 3 // adverb
  10 + | Adjective = 4 // adjective
  11 + | Prepositions = 5 // prepositions
  12 + | Others = 10
  13 +
  14 +let empty_string = ""
  15 +/// Word record fetched from Kevo.Store by getSynonyms; marked as a ProtoBuf contract with ImplicitFields.AllPublic.
  16 +[<ProtoContract(ImplicitFields = ImplicitFields.AllPublic)>]
  17 +type WordItem (word : string, wordst : string, suff : string, part : LexicalClass, syn : int array, prefix : string) = class
  18 + member val Word : string = word with get, set // the text returned by ToString and by getSynonyms
  19 + member val Wordst : string = wordst with get, set // compared with the query word in getSynonyms (presumably the stem - TODO confirm)
  20 + member val Suff : string = suff with get, set // presumably the suffix - TODO confirm
  21 + member val Part : LexicalClass = part with get, set
  22 + member val Syn : int array = syn with get, set // ids that getSynonyms resolves through Kevo.Store.findById
  23 + member val Prefix : string = prefix with get, set
  24 + new() = WordItem(empty_string, empty_string, empty_string, LexicalClass.Others, [||], empty_string) // parameterless constructor with empty defaults (presumably needed for deserialization - confirm)
  25 + override x.ToString() = x.Word
  26 + end
  27 +
  28 +
  29 +
  30 + let getSynonyms for_word =
  31 + let query (x:WordItem) =
  32 + x.Wordst = for_word
  33 + let getWord id =
  34 + let witem = Kevo.Store.findById<WordItem> id
  35 + match witem with
  36 + | None -> empty_string
  37 + | _ -> witem.Value.Word
  38 + let getRelations (word_ids: int array) =
  39 + word_ids |> Array.map (fun x -> getWord x) |> List.ofArray
  40 + Kevo.Store.findByQuery<WordItem> query |> List.collect (fun x -> getRelations x.Syn)
3  TinyNLP.fs.fsproj
@@ -55,7 +55,8 @@
55 55 <ItemGroup>
56 56 <Compile Include="NLPCore.fs" />
57 57 <Compile Include="Tokenizer.fs" />
58   - <Compile Include="Synonyms.fs" />
  58 + <Compile Include="Stemming.fs" />
  59 + <Compile Include="Synonymizer.fs" />
59 60 <None Include="packages.config" />
60 61 </ItemGroup>
61 62 <PropertyGroup>

0 comments on commit 3b9bf06

Please sign in to comment.
Something went wrong with that request. Please try again.