From 705bf83730603bec4741b81c9c467cf30e451f22 Mon Sep 17 00:00:00 2001 From: Stanislav Malyshev Date: Tue, 29 Dec 2015 13:05:33 -0800 Subject: [PATCH] Code style fixes Change-Id: I402c69f8d1a9f4d8bf3515d220c2ae612d9de404 --- TextCat.php | 98 +++++++++++++++++++++++++++------------------------ catus.php | 54 ++++++++++++++-------------- composer.json | 2 +- felis.php | 20 +++++------ phpcs.xml | 5 ++- 5 files changed, 94 insertions(+), 85 deletions(-) diff --git a/TextCat.php b/TextCat.php index 3dcc473..2749960 100644 --- a/TextCat.php +++ b/TextCat.php @@ -47,17 +47,17 @@ public function setMinFreq( $minFreq ) { /** * @param string $dir */ - public function __construct($dir = null) { - if(empty($dir)) { + public function __construct( $dir = null ) { + if ( empty( $dir ) ) { $dir = __DIR__."/LM"; } $this->dir = $dir; - foreach(new DirectoryIterator($dir) as $file) { - if(!$file->isFile()) { + foreach ( new DirectoryIterator( $dir ) as $file ) { + if ( !$file->isFile() ) { continue; } - if($file->getExtension() == "lm") { - $this->langFiles[$file->getBasename(".lm")] = $file->getPathname(); + if ( $file->getExtension() == "lm" ) { + $this->langFiles[$file->getBasename( ".lm" )] = $file->getPathname(); } } } @@ -68,43 +68,45 @@ public function __construct($dir = null) { * @param int $maxNgrams How many ngrams to use. * @return int[] */ - public function createLM($text, $maxNgrams) { + public function createLM( $text, $maxNgrams ) { $ngram = array(); - foreach(preg_split("/[{$this->wordSeparator}]+/u", $text) as $word) { - if(empty($word)) { + foreach ( preg_split( "/[{$this->wordSeparator}]+/u", $text ) as $word ) { + if ( empty( $word ) ) { continue; } $word = "_".$word."_"; - $len = mb_strlen($word, "UTF-8"); - for($i=0;$i<$len;$i++) { + $len = mb_strlen( $word, "UTF-8" ); + for ( $i=0;$i<$len;$i++ ) { $rlen = $len - $i; - if($rlen > 4) { - @$ngram[mb_substr($word, $i, 5, "UTF-8")]++; + if ( $rlen > 4 ) { + @$ngram[mb_substr( $word, $i, 5, "UTF-8" )]++; } - if($rlen > 3) { - @$ngram[mb_substr($word, $i, 4, "UTF-8")]++; + if ( $rlen > 3 ) { + @$ngram[mb_substr( $word, $i, 4, "UTF-8" )]++; } - if($rlen > 2) { - @$ngram[mb_substr($word, $i, 3, "UTF-8")]++; + if ( $rlen > 2 ) { + @$ngram[mb_substr( $word, $i, 3, "UTF-8" )]++; } - if($rlen > 1) { - @$ngram[mb_substr($word, $i, 2, "UTF-8")]++; + if ( $rlen > 1 ) { + @$ngram[mb_substr( $word, $i, 2, "UTF-8" )]++; } - @$ngram[mb_substr($word, $i, 1, "UTF-8")]++; + @$ngram[mb_substr( $word, $i, 1, "UTF-8" )]++; } } - if($this->minFreq) { + if ( $this->minFreq ) { $min = $this->minFreq; - $ngram = array_filter($ngram, function ($v) use($min) { return $v > $min; }); + $ngram = array_filter( $ngram, function ( $v ) use( $min ) { return $v > $min; + + } ); } - uksort( $ngram, function($k1, $k2) use($ngram) { - if($ngram[$k1] == $ngram[$k2]) { - return strcmp($k1, $k2); + uksort( $ngram, function( $k1, $k2 ) use( $ngram ) { + if ( $ngram[$k1] == $ngram[$k2] ) { + return strcmp( $k1, $k2 ); } return $ngram[$k2] - $ngram[$k1]; - }); - if(count($ngram) > $maxNgrams) { - array_splice($ngram, $maxNgrams); + } ); + if ( count( $ngram ) > $maxNgrams ) { + array_splice( $ngram, $maxNgrams ); } return $ngram; } @@ -114,9 +116,9 @@ public function createLM($text, $maxNgrams) { * @param string $langFile * @return int[] Language file data */ - public function loadLanguageFile($langFile) { + public function loadLanguageFile( $langFile ) { include $langFile; - array_splice($ranks, $this->maxNgrams); + array_splice( $ranks, $this->maxNgrams ); return $ranks; } @@ -125,15 +127,17 @@ public function loadLanguageFile($langFile) { * @param int[] $ngrams * @param string $outfile Output filename */ - public function writeLanguageFile($ngrams, $outfile) { - $out = fopen($outfile, "w"); + public function writeLanguageFile( $ngrams, $outfile ) { + $out = fopen( $outfile, "w" ); // write original array as "$ngrams" - fwrite($out, 'createLM($text, $this->maxNgrams)); - if($candidates) { + public function classify( $text, $candidates = null ) { + $inputgrams = array_keys( $this->createLM( $text, $this->maxNgrams ) ); + if ( $candidates ) { // flip for more efficient lookups - $candidates = array_flip($candidates); + $candidates = array_flip( $candidates ); } $results = array(); - foreach($this->langFiles as $language => $langFile) { - if($candidates && !isset($candidates[$language])) { + foreach ( $this->langFiles as $language => $langFile ) { + if ( $candidates && !isset( $candidates[$language] ) ) { continue; } - $ngrams = $this->loadLanguageFile($langFile); + $ngrams = $this->loadLanguageFile( $langFile ); $p = 0; - foreach($inputgrams as $i => $ingram) { - if( !empty($ngrams[$ingram]) ) { - $p += abs($ngrams[$ingram] - $i); + foreach ( $inputgrams as $i => $ingram ) { + if ( !empty( $ngrams[$ingram] ) ) { + $p += abs( $ngrams[$ingram] - $i ); } else { $p += $this->maxNgrams; } } $results[$language] = $p; } - asort($results); + asort( $results ); return $results; } } diff --git a/catus.php b/catus.php index 0df2da0..c3ccedf 100644 --- a/catus.php +++ b/catus.php @@ -4,9 +4,9 @@ */ require_once __DIR__.'/TextCat.php'; -$options = getopt('a:c:d:f:t:u:l:h'); +$options = getopt( 'a:c:d:f:t:u:l:h' ); -if(isset($options['h'])) { +if ( isset( $options['h'] ) ) { $help = <<setMaxNgrams(intval($options['t'])); +if ( !empty( $options['t'] ) ) { + $cat->setMaxNgrams( intval( $options['t'] ) ); } -if(!empty($options['f'])) { - $cat->setMinFreq(intval($options['f'])); +if ( !empty( $options['f'] ) ) { + $cat->setMinFreq( intval( $options['f'] ) ); } -$input = isset($options['l']) ? $options['l'] : file_get_contents("php://stdin"); -if(!empty($options['c'])) { - $result = $cat->classify($input, explode(",", $options['c'])); +$input = isset( $options['l'] ) ? $options['l'] : file_get_contents( "php://stdin" ); +if ( !empty( $options['c'] ) ) { + $result = $cat->classify( $input, explode( ",", $options['c'] ) ); } else { - $result = $cat->classify($input); + $result = $cat->classify( $input ); } -if(empty($result)) { +if ( empty( $result ) ) { echo "No match found.\n"; - exit(1); + exit( 1 ); } -if(!empty($options['u'])) { - $max = reset($result) * $options['u']; +if ( !empty( $options['u'] ) ) { + $max = reset( $result ) * $options['u']; } else { - $max = reset($result) * 1.05; + $max = reset( $result ) * 1.05; } -if(!empty($options['a'])) { +if ( !empty( $options['a'] ) ) { $top = $options['a']; } else { $top = 10; } -$result = array_filter($result, function ($res) use($max) { return $res < $max; }); -if($result && count($result) <= $top) { - echo join(" or ", array_keys($result)) . "\n"; - exit(0); +$result = array_filter( $result, function ( $res ) use( $max ) { return $res < $max; + +} ); +if ( $result && count( $result ) <= $top ) { + echo join( " or ", array_keys( $result ) ) . "\n"; + exit( 0 ); } else { echo "Can not determine language.\n"; - exit(1); -} \ No newline at end of file + exit( 1 ); +} diff --git a/composer.json b/composer.json index e52e12d..e5c336c 100644 --- a/composer.json +++ b/composer.json @@ -18,7 +18,7 @@ "test": [ "parallel-lint . --exclude vendor", "phpunit tests/", - "phpcs -p -s ." + "phpcs -p -s" ] } } diff --git a/felis.php b/felis.php index 9f83f67..33e96f7 100644 --- a/felis.php +++ b/felis.php @@ -9,19 +9,19 @@ // TODO: add option to control model ngram count $maxNgrams = 4000; -if($argc != 3) { - die("Use $argv[0] INPUTDIR OUTPUTDIR\n"); +if ( $argc != 3 ) { + die( "Use $argv[0] INPUTDIR OUTPUTDIR\n" ); } -if(!file_exists($argv[2])) { - mkdir($argv[2], 0755, true); +if ( !file_exists( $argv[2] ) ) { + mkdir( $argv[2], 0755, true ); } -$cat = new TextCat($argv[2]); +$cat = new TextCat( $argv[2] ); -foreach(new DirectoryIterator($argv[1]) as $file) { - if(!$file->isFile()) { +foreach ( new DirectoryIterator( $argv[1] ) as $file ) { + if ( !$file->isFile() ) { continue; } - $ngrams = $cat->createLM(file_get_contents($file->getPathname()), $maxNgrams); - $cat->writeLanguageFile($ngrams, $argv[2] . "/" . $file->getBasename(".txt") . ".lm"); + $ngrams = $cat->createLM( file_get_contents( $file->getPathname() ), $maxNgrams ); + $cat->writeLanguageFile( $ngrams, $argv[2] . "/" . $file->getBasename( ".txt" ) . ".lm" ); } -exit(0); \ No newline at end of file +exit( 0 ); diff --git a/phpcs.xml b/phpcs.xml index 874d386..f79e751 100644 --- a/phpcs.xml +++ b/phpcs.xml @@ -1,8 +1,11 @@ - . vendor LM tests + + + +