Simply put: predicts the next word the user will write.
git clone git@github.com:syzer/distributedNgram.git && cd $_
npm install
npm install --save-dev
The file nGram.js offers a more compact version of the code:
npm start
var jsSpark = require('js-spark')({workers: 16});
var task = jsSpark.jsSpark;
var q = jsSpark.q;

task([20, 30, 40, 50])
    // this part is executed on the client side
    .map(function addOne(num) {
        return num + 1;
    })
    .reduce(function sumUp(sum, num) {
        return sum + num;
    })
    .run()
    .then(function (data) {
        // this part is executed back on the server
        console.log('i finished calculating', data);
    })
    // always terminate the chain with a rejection handler,
    // otherwise worker errors are silently swallowed
    .catch(function (err) {
        console.error('calculation failed', err);
    });
npm test
git clone https://github.com/syzer/distributedNgram.git
load:
-
dracula
-
lodash
-
load helpers
(gist)
make function prepare()
// remove special characters
// Normalize raw text for n-gram processing: lowercase everything,
// strip punctuation and special characters (including curly quotes),
// and collapse any resulting runs of whitespace into single spaces.
// @param  {string} str - raw input text
// @returns {string} cleaned, lowercase, space-separated words
function prepare(str) {
    return String(str)
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, '') // drop everything but letters, digits, whitespace
        .replace(/\s+/g, ' ')        // collapse whitespace runs left by removals
        .trim();
}
prepare('“Listen to them, the children of the night. What music they make!”')
//=>"listen to them the children of the night what music they make"
(gist)
make bigramText()
bigramText("to listen to them the children of the night what music they make");
//=>{to: {listen: 1, them:1} , listen:{to:1}, the:{children:1}}...
// Build a bigram frequency table from a whitespace-separated string.
// For every adjacent word pair (w1, w2) it increments result[w1][w2].
// bigramText('to listen to them')
// //=> { to: { listen: 1, them: 1 }, listen: { to: 1 } }
// @param  {string} str - prepared (lowercased, punctuation-free) text
// @returns {Object.<string, Object.<string, number>>} bigram counts
// Note: the original stub referenced undefined `arr`/`bigramArray`
// and would throw a ReferenceError when called.
function bigramText(str) {
    var words = str.split(/\s+/).filter(Boolean); // filter drops '' from empty input
    var grams = {};
    for (var i = 0; i < words.length - 1; i++) {
        var first = words[i];
        var second = words[i + 1];
        if (!grams[first]) {
            grams[first] = {};
        }
        grams[first][second] = (grams[first][second] || 0) + 1;
    }
    return grams;
}
(gist)
function mergeSmall()
-
create 2 tasks ch01, and ch02
-
use tasks to bigram those chapters
-
reduce response with _.merge
(gist)
function mergeBig(texts)
-
load [ch1, ch2, ch3] or texts
-
make distinct tasks to bigram this text
-
reduce with _.mergeObjectsInArr
-
cache result
-
return result
(gist)
function predict(word)
-
load appropriate key/word from cache
-
calc total hits
-
sort all hits in order,
may use helper function objToSortedArr(obj)
- calc frequency/probability of next word
(gist)
function train(fileName, splitter)
-
load file
-
prepare
-
use splitter(string) to create separate tasks
-
calculate tasks on clients using mergeBig()
[ ] git checkout [ ] js-spark adventure