Skip to content

SpellCorrect Exercise Solution

jasonbaldridge edited this page Feb 5, 2013 · 3 revisions

Here is a solution that covers all the steps in SpellCorrect-Exercise.

object SpellingCorrector {

  def main(args: Array[String]) {
    
    // The input sentence
    val input = args(0)
    
    // Get the vocabulary from one or both input files (when available)
    val firstVocab = getVocab(args(1)) 
    val secondVocab = if (args.length>2) getVocab(args(2)) else Set[String]()
    val vocab = firstVocab ++ secondVocab

    // A map from words to their counts. Can be used later to look up
    // vectors for candidates without needing to recompute the counts.
    // (Trading use of more space to make cosine computations faster.)
    val vocabVectors: Map[String,Map[String,Int]] = 
      vocab.map(word => (word, getNgramCounts(word))).toMap

    // Build the inverted index.
    val invertedIndex = vocabVectors
      .toSeq
      .flatMap { case(word, ngrams) => {
	ngrams.map { case(ngram, count) => (ngram,word) }.toSeq
      }}
      .groupBy(x=>x._1)
      .mapValues(_.map(_._2))
      .withDefault(x=>Seq[String]())

    // Process the input sentence.
    println("Detecting spelling errors in: " + input)
    input.split(" ").foreach { token => { 
      if (!vocab(token)) {
	println("ERROR: " + token)
 
	val typoNgrams = getNgramCounts(token)

	// Get all the candidates and output 10.
	val candidates = typoNgrams.keys.flatMap(invertedIndex).toSeq
	println("  Candidates: " + candidates.take(10).mkString(" "))

      }
    }}
  }

  // Get a word list from a file with one word per line
  def getVocab(filename: String) = 
    io.Source.fromFile(filename).getLines.toSet

  // Get the character ngrams in a word with their counts
  def getNgramCounts(word: String, size: Int = 3): Map[String,Int] =
    ("#"+word+"#")
      .sliding(size)
      .toSeq
      .groupBy(x=>x)
      .mapValues(_.length)
      .withDefault(x=>0)

}