Skip to content
Permalink
Browse files

rewrote the file parser to work consistently;

also found an error with the search results for files;
  • Loading branch information...
caseysoftware committed May 26, 2014
1 parent 01ed67f commit c1852d8863abc7a0802763627b6dd520447b58d8
Showing with 85 additions and 95 deletions.
  1. +77 −0 classes/w2p/FileSystem/Indexer.class.php
  2. +8 −95 modules/files/files.class.php
@@ -0,0 +1,77 @@
<?php
/**
 * Builds the search index (rows in the files_index table) for a stored file.
 *
 * The file's raw bytes are lower-cased, reduced to [a-z0-9] words, filtered
 * against the 'FileIndexIgnoreWords' system value and inserted one row per
 * word together with the word's position in the text.
 */
class w2p_FileSystem_Indexer
{
    /** @var w2p_Database_Query query object used for every statement issued by this indexer */
    protected $query = null;

    /**
     * @param w2p_Database_Query $query query object the indexer runs its statements through
     */
    public function __construct(w2p_Database_Query $query)
    {
        $this->query = $query;
    }

    /**
     * Parse a file and store its words for searching.
     *
     * Workaround for indexing large files: based on the 'index_max_file_size'
     * config value (in KB), files larger than the limit are not indexed.
     * A negative value means "no file size limit".
     *
     * Whether or not any word was stored, the file is flagged as indexed
     * (files.file_indexed = 1) so it is not picked up again - except when no
     * parser is configured, in which case nothing is touched.
     *
     * @todo convert to using the FileSystem methods
     *
     * @param CFile $file the file to index; its _filepath property is set as a side effect
     *
     * @return int|false number of words written to files_index (0 when the
     *                   file was skipped, unreadable or missing), or false
     *                   when neither a type-specific nor a default parser is
     *                   configured
     */
    public function index(CFile $file)
    {
        // Initialised up front so every path below returns a well-defined count.
        $wordsIndexed = 0;

        $index_max_file_size = w2PgetConfig('index_max_file_size', 0);
        $fileSize = (int) $file->file_size;

        if ($fileSize > 0 && ($index_max_file_size < 0 || $fileSize <= $index_max_file_size * 1024)) {
            // The parser setting is not executed here; it only acts as the
            // "is indexing configured for this file type" switch.
            $parser = w2PgetConfig('parser_' . $file->file_type);
            if (!$parser) {
                $parser = w2PgetConfig('parser_default');
            }
            if (!$parser) {
                return false;
            }

            $file->_filepath = W2P_BASE_DIR . '/files/' . $file->file_project . '/' . $file->file_real_filename;

            if (is_file($file->_filepath) && is_readable($file->_filepath)) {
                // Read at most file_size bytes, as recorded for the file.
                $contents = file_get_contents($file->_filepath, false, null, 0, $fileSize);

                if (false !== $contents) {
                    $ignore = $this->loadIgnoreWords();
                    $q = $this->query;

                    foreach ($this->tokenize($contents) as $placement => $word) {
                        if (isset($ignore[$word])) {
                            continue;
                        }
                        $q->addTable('files_index');
                        $q->addInsert('file_id', $file->file_id);
                        $q->addInsert('word', $word);
                        $q->addInsert('word_placement', $placement);
                        $q->exec();
                        $q->clear();
                        $wordsIndexed++;
                    }
                }
            } else {
                //TODO: if the file doesn't exist.. should we delete the db record?
            }
        }

        // Reuse the injected query object; clear it first in case no insert
        // ran above and it still carries state from the caller.
        $q = $this->query;
        $q->clear();
        $q->addTable('files');
        $q->addUpdate('file_indexed', 1);
        $q->addWhere('file_id = ' . (int) $file->file_id);
        $q->exec();
        $q->clear();

        return $wordsIndexed;
    }

    /**
     * Split raw file contents into lower-case alphanumeric words.
     *
     * Whitespace runs (including line breaks and tabs) are turned into a
     * single separator BEFORE other characters are stripped, so words on
     * adjacent lines are not glued together.
     *
     * @param string $contents raw file contents
     *
     * @return array list of non-empty words; the key is the word's position
     */
    protected function tokenize($contents)
    {
        $text = strtolower($contents);
        $text = (string) preg_replace('/\s+/', ' ', $text);
        $text = (string) preg_replace('/[^a-z0-9 ]/', '', $text);

        return preg_split('/ +/', $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Load the comma separated 'FileIndexIgnoreWords' system value as a
     * lookup table.
     *
     * @return array map of lower-cased, trimmed ignore word => true
     */
    protected function loadIgnoreWords()
    {
        $sysVal = w2PgetSysVal('FileIndexIgnoreWords');
        if (is_array($sysVal)) {
            $sysVal = isset($sysVal['FileIndexIgnoreWords']) ? $sysVal['FileIndexIgnoreWords'] : '';
        }

        $lookup = array();
        foreach (explode(',', strtolower((string) $sysVal)) as $ignoreWord) {
            $ignoreWord = trim($ignoreWord);
            if ('' !== $ignoreWord) {
                $lookup[$ignoreWord] = true;
            }
        }

        return $lookup;
    }
}
@@ -107,7 +107,9 @@ public function hook_cron()
foreach($unindexedFiles as $file_id => $notUsed) {
$this->load($file_id);
$this->indexStrings($this->_AppUI);
$indexer = new w2p_FileSystem_Indexer($this->_getQuery());
$indexer->index($this);
}
$this->indexer = false;
}
@@ -118,7 +120,7 @@ public function hook_search()
$search['table_alias'] = 'f';
$search['table_module'] = 'files';
$search['table_key'] = 'f.file_id'; // primary key in searched table
$search['table_link'] = 'index.php?m=files&a=addedit&file_id='; // first part of link
$search['table_link'] = 'index.php?m=files&a=view&file_id='; // first part of link
$search['table_title'] = 'Files';
$search['table_orderby'] = 'file_name, word_placement';
$search['search_fields'] = array('file_name', 'file_description',
@@ -268,99 +270,10 @@ protected function hook_postDelete()
parent::hook_postDelete();
}
/**
* parse file for indexing
* @todo convert to using the FileSystem methods
*/
public function indexStrings() {
$nwords_indexed = 0;
/* Workaround for indexing large files:
** Based on the value defined in config data,
** files with file_size greater than specified limit
** are not indexed for searching.
** Negative value :<=> no filesize limit
*/
$index_max_file_size = w2PgetConfig('index_max_file_size', 0);
if ($this->file_size > 0 && ($index_max_file_size < 0 || (int) $this->file_size <= $index_max_file_size * 1024)) {
// get the parser application
$parser = $this->_w2Pconfig['parser_' . $this->file_type];
if (!$parser) {
$parser = $this->_w2Pconfig['parser_default'];
}
if (!$parser) {
return false;
}
// buffer the file
$this->_filepath = W2P_BASE_DIR . '/files/' . $this->file_project . '/' . $this->file_real_filename;
if (file_exists($this->_filepath)) {
$fp = fopen($this->_filepath, 'rb');
$x = fread($fp, $this->file_size);
fclose($fp);
// parse it
$parser = $parser . ' ' . $this->_filepath;
$pos = strpos($parser, '/pdf');
/*
* TODO: I *really* hate using error surpression here and I would
* normally just detect if safe_mode is on and if it was, skip
* this call. Unfortunately, safe_mode has been deprecated in
* 5.3 and will be removed in 5.4
*/
if (false !== $pos) {
$x = @shell_exec(`$parser -`);
} else {
$x = @shell_exec(`$parser`);
}
// if nothing, return
if (strlen($x) < 1) {
return 0;
}
// remove punctuation and parse the strings
$x = str_replace(array('.', ',', '!', '@', '(', ')'), ' ', $x);
$warr = explode(' ', $x);
$wordarr = array();
$nwords = count($warr);
for ($x = 0; $x < $nwords; $x++) {
$newword = $warr[$x];
if (!preg_match('[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~]', $newword)
&& mb_strlen(mb_trim($newword)) > 2
&& !preg_match('[0-9]', $newword)) {
$wordarr[$newword] = $x;
}
}
// filter out common strings
$ignore = w2PgetSysVal('FileIndexIgnoreWords');
$ignore = str_replace(' ,', ',', $ignore);
$ignore = str_replace(', ', ',', $ignore);
$ignore = explode(',', $ignore);
foreach ($ignore as $w) {
unset($wordarr[$w]);
}
$nwords_indexed = count($wordarr);
// insert the strings into the table
while (list($key, $val) = each($wordarr)) {
$q = $this->_getQuery();
$q->addTable('files_index');
$q->addReplace('file_id', $this->file_id);
$q->addReplace('word', $key);
$q->addReplace('word_placement', $val);
$q->exec();
$q->clear();
}
} else {
//TODO: if the file doesn't exist.. should we delete the db record?
}
}
$q = $this->_getQuery();
$q->addTable('files');
$q->addUpdate('file_indexed', 1);
$q->addWhere('file_id = '. $this->file_id);
$q->exec();
return $nwords_indexed;
/**
 * Index this file's contents for searching.
 *
 * Delegates to w2p_FileSystem_Indexer, handing it this object's query, and
 * passes the indexer's result back to the caller.
 *
 * @return mixed whatever w2p_FileSystem_Indexer::index() returns for this file
 */
public function indexStrings()
{
    $indexer = new w2p_FileSystem_Indexer($this->_getQuery());

    return $indexer->index($this);
}
//function notifies about file changing

0 comments on commit c1852d8

Please sign in to comment.
You can’t perform that action at this time.