Skip to content

Commit

Permalink
Code style fixes
Browse files Browse the repository at this point in the history
Change-Id: I402c69f8d1a9f4d8bf3515d220c2ae612d9de404
  • Loading branch information
smalyshev committed Dec 29, 2015
1 parent 198bb0c commit 705bf83
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 85 deletions.
98 changes: 51 additions & 47 deletions TextCat.php
Expand Up @@ -47,17 +47,17 @@ public function setMinFreq( $minFreq ) {
/**
 * Set up the classifier by indexing the language model files in a directory.
 *
 * Every regular file in $dir whose extension is "lm" is registered in
 * $this->langFiles, keyed by its base name without the ".lm" suffix and
 * mapped to its full path.
 *
 * @param string $dir Directory holding the language model files. When empty,
 *   the "LM" directory next to this file is used.
 * @throws UnexpectedValueException From DirectoryIterator when $dir cannot be opened.
 */
public function __construct( $dir = null ) {
	if ( empty( $dir ) ) {
		$dir = __DIR__ . "/LM";
	}
	$this->dir = $dir;
	foreach ( new DirectoryIterator( $dir ) as $file ) {
		// Skip directories, dot entries and anything that is not a "*.lm" file.
		if ( $file->isFile() && $file->getExtension() === "lm" ) {
			$this->langFiles[$file->getBasename( ".lm" )] = $file->getPathname();
		}
	}
}
Expand All @@ -68,43 +68,45 @@ public function __construct($dir = null) {
* @param int $maxNgrams How many ngrams to use.
* @return int[]
*/
public function createLM( $text, $maxNgrams ) {
	$ngram = array();
	$words = preg_split( "/[{$this->wordSeparator}]+/u", $text );
	if ( $words === false ) {
		// preg_split() returns false on failure (for instance input that is not
		// valid UTF-8 while the /u modifier is set); treat that as "no words".
		$words = array();
	}
	foreach ( $words as $word ) {
		// Compare with '' instead of using empty(), which would also discard the word "0".
		if ( $word === '' ) {
			continue;
		}
		// Underscores mark the word boundaries inside the n-grams.
		$word = "_" . $word . "_";
		$len = mb_strlen( $word, "UTF-8" );
		for ( $i = 0; $i < $len; $i++ ) {
			// Count every n-gram of 1 to 5 characters that starts at $i and fits in the word.
			$longest = min( 5, $len - $i );
			for ( $n = 1; $n <= $longest; $n++ ) {
				$gram = mb_substr( $word, $i, $n, "UTF-8" );
				if ( isset( $ngram[$gram] ) ) {
					$ngram[$gram]++;
				} else {
					$ngram[$gram] = 1;
				}
			}
		}
	}
	if ( $this->minFreq ) {
		// Keep only n-grams seen more than minFreq times.
		$min = $this->minFreq;
		$ngram = array_filter( $ngram, function ( $v ) use ( $min ) {
			return $v > $min;
		} );
	}
	// Most frequent first; equal counts are ordered by n-gram so the result is deterministic.
	uksort( $ngram, function ( $k1, $k2 ) use ( $ngram ) {
		if ( $ngram[$k1] == $ngram[$k2] ) {
			// Purely numeric n-grams are stored as integer keys, hence the casts.
			return strcmp( (string)$k1, (string)$k2 );
		}
		return $ngram[$k2] - $ngram[$k1];
	} );
	if ( count( $ngram ) > $maxNgrams ) {
		// array_slice() with preserve_keys keeps numeric n-gram keys intact;
		// array_splice() would renumber them from 0.
		$ngram = array_slice( $ngram, 0, $maxNgrams, true );
	}
	return $ngram;
}
Expand All @@ -114,9 +116,9 @@ public function createLM($text, $maxNgrams) {
* @param string $langFile
* @return int[] Language file data
*/
public function loadLanguageFile( $langFile ) {
	// Fallback so a file that does not define $ranks yields an empty model
	// instead of an undefined variable.
	$ranks = array();
	// The language file is PHP code: including it executes it in this scope,
	// where it is expected to assign $ranks.
	// NOTE(review): only pass paths to trusted files here, since the file is executed.
	include $langFile;
	// Keep the first maxNgrams entries. preserve_keys is required because the
	// keys are the n-grams themselves and numeric ones must not be renumbered.
	return array_slice( $ranks, 0, $this->maxNgrams, true );
}

Expand All @@ -125,15 +127,17 @@ public function loadLanguageFile($langFile) {
* @param int[] $ngrams
* @param string $outfile Output filename
*/
public function writeLanguageFile( $ngrams, $outfile ) {
	// Reduced array: same keys in the same order, each mapped to its 1-based position.
	$ranks = array();
	$rank = 1;
	foreach ( $ngrams as $gram => $unused ) {
		$ranks[$gram] = $rank++;
	}
	// The file is PHP source defining the original array as $ngrams and the
	// reduced array as $ranks.
	$code = '<?php $ngrams = ' . var_export( $ngrams, true ) . ";\n" .
		'$ranks = ' . var_export( $ranks, true ) . ";\n";
	// Report an unwritable target instead of continuing with a failed handle.
	if ( file_put_contents( $outfile, $code ) === false ) {
		throw new RuntimeException( "Cannot write language file $outfile" );
	}
}

/**
Expand All @@ -143,29 +147,29 @@ public function writeLanguageFile($ngrams, $outfile) {
* @return int[] Array with keys of language names and values of score.
* Sorted by ascending score, with first result being the best.
*/
public function classify( $text, $candidates = null ) {
	// N-grams of the input; the array index is the n-gram's position in the input's ordering.
	$inputgrams = array_keys( $this->createLM( $text, $this->maxNgrams ) );
	if ( $candidates ) {
		// flip for more efficient lookups
		$candidates = array_flip( $candidates );
	}
	$results = array();
	foreach ( $this->langFiles as $language => $langFile ) {
		// When a candidate list was given, score only those languages.
		if ( $candidates && !isset( $candidates[$language] ) ) {
			continue;
		}
		$ngrams = $this->loadLanguageFile( $langFile );
		$p = 0;
		foreach ( $inputgrams as $i => $ingram ) {
			if ( !empty( $ngrams[$ingram] ) ) {
				// Known n-gram: add the distance between its rank in the model
				// and its position in the input.
				$p += abs( $ngrams[$ingram] - $i );
			} else {
				// N-gram missing from the model: add the maximum penalty.
				$p += $this->maxNgrams;
			}
		}
		$results[$language] = $p;
	}
	// Lowest score (best match) first, keeping the language names as keys.
	asort( $results );
	return $results;
}
}
Expand Down
54 changes: 28 additions & 26 deletions catus.php
Expand Up @@ -4,9 +4,9 @@
*/
require_once __DIR__.'/TextCat.php';

$options = getopt('a:c:d:f:t:u:l:h');
$options = getopt( 'a:c:d:f:t:u:l:h' );

if(isset($options['h'])) {
if ( isset( $options['h'] ) ) {
$help = <<<HELP
{$argv[0]} [-d Dir] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float]
Expand Down Expand Up @@ -37,52 +37,54 @@
HELP;
echo $help;
exit(0);
exit( 0 );
}

// Language model directory: -d when given, otherwise the LM directory next to this script.
if ( !empty( $options['d'] ) ) {
	$dir = $options['d'];
} else {
	$dir = __DIR__ . "/LM";
}

$cat = new TextCat( $dir );

// -t: passed to setMaxNgrams(); -f: passed to setMinFreq().
if ( !empty( $options['t'] ) ) {
	$cat->setMaxNgrams( intval( $options['t'] ) );
}
if ( !empty( $options['f'] ) ) {
	$cat->setMinFreq( intval( $options['f'] ) );
}

// Text comes from -l when given, otherwise from standard input.
$input = isset( $options['l'] ) ? $options['l'] : file_get_contents( "php://stdin" );
// -c: comma-separated list of candidate languages to restrict the classification to.
if ( !empty( $options['c'] ) ) {
	$result = $cat->classify( $input, explode( ",", $options['c'] ) );
} else {
	$result = $cat->classify( $input );
}

if ( empty( $result ) ) {
	echo "No match found.\n";
	exit( 1 );
}

// Score threshold: the best (first) score multiplied by -u, or by 1.05 when -u is absent.
if ( !empty( $options['u'] ) ) {
	$max = reset( $result ) * $options['u'];
} else {
	$max = reset( $result ) * 1.05;
}

// -a: largest number of languages that may be reported; defaults to 10.
if ( !empty( $options['a'] ) ) {
	$top = $options['a'];
} else {
	$top = 10;
}
// Keep only the languages scoring below the threshold.
$result = array_filter( $result, function ( $res ) use ( $max ) {
	return $res < $max;
} );
if ( $result && count( $result ) <= $top ) {
	echo join( " or ", array_keys( $result ) ) . "\n";
	exit( 0 );
} else {
	echo "Can not determine language.\n";
	exit( 1 );
}
2 changes: 1 addition & 1 deletion composer.json
Expand Up @@ -18,7 +18,7 @@
"test": [
"parallel-lint . --exclude vendor",
"phpunit tests/",
"phpcs -p -s ."
"phpcs -p -s"
]
}
}
20 changes: 10 additions & 10 deletions felis.php
Expand Up @@ -9,19 +9,19 @@
// TODO: add option to control model ngram count
$maxNgrams = 4000;

if ( $argc != 3 ) {
	// die( string ) would print the message but exit with status 0; a usage
	// error must be visible to the calling shell as a failure.
	fwrite( STDERR, "Use $argv[0] INPUTDIR OUTPUTDIR\n" );
	exit( 1 );
}
// Create the output directory when it is missing and stop if that fails.
if ( !file_exists( $argv[2] ) && !mkdir( $argv[2], 0755, true ) ) {
	fwrite( STDERR, "Cannot create output directory $argv[2]\n" );
	exit( 1 );
}
$cat = new TextCat( $argv[2] );

// Build one language model per regular file in INPUTDIR, written to
// OUTPUTDIR as the file's base name (without ".txt") plus ".lm".
foreach ( new DirectoryIterator( $argv[1] ) as $file ) {
	if ( !$file->isFile() ) {
		continue;
	}
	$text = file_get_contents( $file->getPathname() );
	if ( $text === false ) {
		// Skip unreadable files rather than writing an empty model for them.
		fwrite( STDERR, "Cannot read " . $file->getPathname() . ", skipping\n" );
		continue;
	}
	$ngrams = $cat->createLM( $text, $maxNgrams );
	$cat->writeLanguageFile( $ngrams, $argv[2] . "/" . $file->getBasename( ".txt" ) . ".lm" );
}
exit( 0 );
5 changes: 4 additions & 1 deletion phpcs.xml
@@ -1,8 +1,11 @@
<?xml version="1.0"?>
<ruleset name="textcat">
	<!-- Check everything under the project root except the vendor, LM and tests directories. -->
	<file>.</file>
	<exclude-pattern>vendor</exclude-pattern>
	<exclude-pattern>LM</exclude-pattern>
	<exclude-pattern>tests</exclude-pattern>
	<rule ref="vendor/mediawiki/mediawiki-codesniffer/MediaWiki"/>
	<!-- Intended to switch off the warning about the @ error-suppression operator. -->
	<rule ref="Generic.PHP.NoSilencedErrors.Discouraged">
		<exclude name="Generic.PHP.NoSilencedErrors.Discouraged"/>
	</rule>
</ruleset>

0 comments on commit 705bf83

Please sign in to comment.