Added the newly refactored library.

technosophos · Feb 3, 2012 · 6a5a812 · 6a5a812
1 parent 690fba9
commit 6a5a812
Show file tree

Hide file tree

Showing 4 changed files with 426 additions and 0 deletions.
diff --git a/src/LibRIS/ParseException.php b/src/LibRIS/ParseException.php
@@ -0,0 +1,3 @@
+<?php
+namespace LibRIS;
+/**
+ * Exception raised when RIS data cannot be read or parsed.
+ *
+ * It adds nothing to the base exception; the distinct type exists so callers
+ * can catch parser failures separately from other errors.
+ */
+class ParseException extends \Exception
+{
+}
diff --git a/src/LibRIS/RISReader.php b/src/LibRIS/RISReader.php
@@ -0,0 +1,192 @@
+<?php
+/**
+ * This is a library for parsing RIS files.
+ *
+ * LibRIS::RISReader() is the main parser.
+ * LibRIS::RISWriter() can generate RIS data.
+ * LibRIS::RISTags() contains useful RIS information.
+ *
+ * @see http://www.refman.com/support/risformat_intro.asp
+ */
+
+namespace LibRIS;
+
+/**
+ * The main class for parsing RIS files.
+ *
+ * Usage:
+ * @code
+ * <?php
+ *
+ * use \LibRIS\RISReader;
+ *
+ * $reader = new RISReader();
+ *
+ * // Parse a file of RIS data.
+ * $reader->parseFile('path/to/file.ris');
+ *
+ * // Parse a string containing RIS data.
+ * $reader->parseString($someRisString);
+ *
+ * // Parse an array of lines.
+ * $reader->parseArray($arrayOfRISDirectives);
+ *
+ * // Get an associative array of records.
+ * $array = $reader->getRecords();
+ *
+ * // Dump the records to STDOUT
+ * $reader->printRecords();
+ *
+ * ?>
+ * @endcode
+ *
+ * The data structure generated by this class is of the form
+ * @code
+ * <?php
+ * array(
+ *   [0] => array(
+ *     'T1' => array('title one', 'title 2'),
+ *     'TY' => array('JOUR'),
+ *     // Other tags and their values.
+ *   ),
+ *   [1] => array(
+ *     'T1' => array('another entry'),
+ *     'TY' => array('JOUR'),
+ *   ),
+ * );
+ * ?>
+ * @endcode
+ */
+class RISReader {
+
+  /**
+   * The line terminator prescribed by the RIS format.
+   */
+  const RIS_EOL = "\r\n";
+
+  /**
+   * Pattern applied to every (trimmed) input line.
+   *
+   * For a tag line, group 2 holds the two-character tag and group 3 the text
+   * following the dash. For any other line, group 4 holds the whole line.
+   */
+  const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.*))|(.*)$/';
+
+  /**
+   * Records produced by the most recent parse, or NULL before any parse.
+   */
+  protected $data = NULL;
+
+  /**
+   * Create a new reader.
+   *
+   * @param array $options
+   *  Currently unused.
+   */
+  public function __construct($options = array()) {
+
+  }
+
+  /**
+   * Parse an RIS file.
+   *
+   * This will parse the file and return a data structure representing the
+   * records. The same structure is afterwards available via getRecords().
+   *
+   * @param string $filename
+   *  The full path to the file to parse.
+   * @param StreamContext $context
+   *  The stream context (if desired) for handling the file.
+   * @retval array
+   *  An indexed array of individual sources, each of which is an
+   *  associative array of entry details. (See LibRIS)
+   * @throws ParseException
+   *  If the file does not exist or cannot be read.
+   */
+  public function parseFile($filename, $context = NULL) {
+    if (!is_file($filename)) {
+      throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));
+    }
+
+    // FILE_SKIP_EMPTY_LINES only takes effect when combined with
+    // FILE_IGNORE_NEW_LINES. The deprecated FILE_TEXT flag is not needed.
+    $flags = FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES;
+    if ($context === NULL) {
+      $contents = file($filename, $flags);
+    }
+    else {
+      $contents = file($filename, $flags, $context);
+    }
+
+    // file() signals failure (e.g. an unreadable file) by returning FALSE.
+    if ($contents === FALSE) {
+      throw new ParseException(sprintf('File %s could not be read.', htmlentities($filename)));
+    }
+
+    return $this->parseArray($contents);
+  }
+
+  /**
+   * Parse a string of RIS data.
+   *
+   * This will parse RIS data into a representative data structure. Lines may
+   * be terminated by CRLF (the RIS standard), LF, or CR.
+   *
+   * @param string $string
+   *  RIS-formatted data in a string.
+   * @retval array
+   *  An indexed array of individual sources, each of which is an
+   *  associative array of entry details. (See {@link LibRIS})
+   */
+  public function parseString($string) {
+    // Splitting on "\r\n" alone would turn LF-only input into a single line.
+    $contents = preg_split('/\r\n|\r|\n/', (string) $string);
+    return $this->parseArray($contents);
+  }
+
+  /**
+   * Take an array of lines and parse them into RIS records.
+   *
+   * The result replaces whatever a previous parse stored in this reader.
+   *
+   * @param array $lines
+   *  Lines of RIS data, with or without trailing line terminators.
+   * @retval array
+   *  An indexed array of records; each record maps a tag to an indexed
+   *  array of the values found for that tag.
+   */
+  public function parseArray($lines) {
+    $recordset = array();
+
+    // Do any cleaning and normalizing.
+    $this->cleanData($lines);
+
+    $record = array();
+    $lastTag = NULL;
+    foreach ($lines as $line) {
+      $line = trim($line);
+      $matches = array();
+
+      preg_match(self::LINE_REGEX, $line, $matches);
+
+      // Compare against '' rather than using empty(): the string "0" is a
+      // legitimate value but counts as empty.
+      $hasValue = isset($matches[3]) && $matches[3] !== '';
+      $hasTag = isset($matches[2]) && $matches[2] !== '';
+      $hasText = isset($matches[4]) && $matches[4] !== '';
+
+      if ($hasValue) {
+        // A tag line with a value: start a new entry for that tag.
+        $lastTag = $matches[2];
+        $record[$lastTag][] = trim($matches[3]);
+      }
+      // End record and prep a new one.
+      elseif ($hasTag && $matches[2] === 'ER') {
+        $lastTag = NULL;
+        $recordset[] = $record;
+        $record = array();
+      }
+      elseif ($hasText) {
+        // Append to the last one.
+        // We skip leading info that precedes the first tag.
+        if (!empty($lastTag)) {
+          $lastEntry = count($record[$lastTag]) - 1;
+          // We trim because some encoders add tabs or multiple spaces.
+          // Standard is silent on how this should be handled.
+          $record[$lastTag][$lastEntry] .= ' ' . trim($matches[4]);
+        }
+      }
+    }
+
+    // Keep a trailing record that was not closed by an ER tag.
+    if (!empty($record)) {
+      $recordset[] = $record;
+    }
+
+    $this->data = $recordset;
+    return $this->data;
+  }
+
+  /**
+   * Get the records produced by the most recent parse.
+   *
+   * @retval array
+   *  An indexed array of records, or NULL if nothing has been parsed yet.
+   */
+  public function getRecords() {
+    return $this->data;
+  }
+
+  /**
+   * Print every parsed record to standard output.
+   *
+   * Each value is printed under the description of its tag. Tags that
+   * RISTags does not know are printed under the raw tag itself. Nothing is
+   * printed if no data has been parsed.
+   */
+  public function printRecords() {
+    if (empty($this->data)) {
+      return;
+    }
+
+    $format = "%s:\n\t%s\n";
+    foreach ($this->data as $record) {
+      foreach ($record as $key => $values) {
+        // Files may contain tags missing from the map; fall back to the tag.
+        $label = isset(RISTags::$tagMap[$key]) ? RISTags::describeTag($key) : $key;
+        foreach ($values as $value) {
+          printf($format, $label, $value);
+        }
+      }
+
+      print PHP_EOL;
+    }
+  }
+
+  /**
+   * Clean up the data before processing.
+   *
+   * @param array $lines
+   *   Array of lines of data; modified in place.
+   */
+  protected function cleanData(&$lines) {
+
+    if (empty($lines)) return;
+
+    // Currently, we only need to strip a BOM if it exists.
+    // Thanks to Derik Badman (http://madinkbeard.com/) for finding the
+    // bug and suggesting this fix:
+    // http://blog.philipp-michels.de/?p=32
+    //
+    // Look up the first key instead of assuming it is 0, since callers may
+    // pass arrays that are not zero-indexed.
+    reset($lines);
+    $firstKey = key($lines);
+    $first = (string) $lines[$firstKey];
+    if (substr($first, 0, 3) === pack('CCC', 0xef, 0xbb, 0xbf)) {
+      $lines[$firstKey] = (string) substr($first, 3);
+    }
+  }
+
+}
diff --git a/src/LibRIS/RISTags.php b/src/LibRIS/RISTags.php
@@ -0,0 +1,168 @@
+<?php
+
+namespace LibRIS;
+
+/**
+ * Lookup tables describing the tags and reference types used in RIS data.
+ */
+class RISTags {
+
+  /**
+   * Get every known tag.
+   *
+   * @retval array
+   *  The keys of the tag map, e.g. 'TY', 'T1'.
+   */
+  public static function getTags() {
+    return array_keys(self::$tagMap);
+  }
+
+  /**
+   * Get every known reference type.
+   *
+   * @retval array
+   *  The keys of the type map, e.g. 'JOUR', 'BOOK'.
+   */
+  public static function getTypes() {
+    return array_keys(self::$typeMap);
+  }
+
+  /**
+   * Get the short label of a tag.
+   *
+   * @param string $tag
+   *  A tag such as 'T1'.
+   * @retval string
+   *  The label from the tag map, or NULL if the tag is not in the map.
+   */
+  public static function describeTag($tag) {
+    // Check first so an unknown tag does not trigger an undefined-index error.
+    return isset(self::$tagMap[$tag]) ? self::$tagMap[$tag] : NULL;
+  }
+
+  /**
+   * Get the label of a reference type.
+   *
+   * @param string $type
+   *  A value of the TY tag, such as 'JOUR'.
+   * @retval string
+   *  The label from the type map, or NULL if the type is not in the map.
+   */
+  public static function describeType($type) {
+    // Check first so an unknown type does not trigger an undefined-index error.
+    return isset(self::$typeMap[$type]) ? self::$typeMap[$type] : NULL;
+  }
+
+  /**
+   * The definitive list of all fields.
+   * @var array
+   * @see http://en.wikipedia.org/wiki/RIS_%28file_format%29
+   * @see http://www.refman.com/support/risformat_intro.asp
+   */
+  public static $tagMap = array(
+    'TY' => 'Type',
+    'ID' => 'Reference ID',
+    'T1' => 'Title',
+    'TI' => 'Book title',
+    'CT' => 'Title of unpublished reference',
+    'A1' => 'Primary author',
+    'A2' => 'Secondary author',
+    'AU' => 'Author',
+    'Y1' => 'Primary date',
+    'PY' => 'Publication year',
+    'N1' => 'Notes',
+    'KW' => 'Keywords',
+    'RP' => 'Reprint status',
+    'SP' => 'Start page',
+    'EP' => 'Ending page',
+    'JF' => 'Periodical full name',
+    'JO' => 'Periodical standard abbreviation',
+    'JA' => 'Periodical in which article was published',
+    'J1' => 'Periodical name - User abbreviation 1',
+    'J2' => 'Periodical name - User abbreviation 2',
+    'VL' => 'Volume',
+    'IS' => 'Issue',
+    'T2' => 'Title secondary',
+    'CY' => 'City of Publication',
+    'PB' => 'Publisher',
+    'U1' => 'User 1',
+    'U2' => 'User 2',
+    'U3' => 'User 3',
+    'U4' => 'User 4',
+    'U5' => 'User 5',
+    'T3' => 'Title series',
+    'N2' => 'Abstract',
+    'SN' => 'ISSN/ISBN/ASIN',
+    'AV' => 'Availability',
+    'M1' => 'Misc. 1',
+    'M2' => 'Misc. 2',
+    'M3' => 'Misc. 3',
+    'AD' => 'Address',
+    'UR' => 'URL',
+    'L1' => 'Link to PDF',
+    'L2' => 'Link to Full-text',
+    'L3' => 'Related records',
+    'L4' => 'Images',
+    'ER' => 'End of Reference',
+
+    // Unsure about the origin of these
+    'Y2' => 'Primary date 2',
+    'BT' => 'Institution [?]',
+  );
+
+  /**
+   * Longer descriptions of the tags, including usage notes.
+   *
+   * Unlike the tag map, this has no entries for 'Y2' and 'BT'.
+   * @var array
+   */
+  public static $tagDescriptions = array(
+    'TY' => 'Type of reference (must be the first tag)',
+    'ID' => 'Reference ID (not imported to reference software)',
+    'T1' => 'Primary title',
+    'TI' => 'Book title',
+    'CT' => 'Title of unpublished reference',
+    'A1' => 'Primary author',
+    'A2' => 'Secondary author (each name on separate line)',
+    'AU' => 'Author (syntax. Last name, First name, Suffix)',
+    'Y1' => 'Primary date',
+    'PY' => 'Publication year (YYYY/MM/DD)',
+    'N1' => 'Notes ',
+    'KW' => 'Keywords (each keyword must be on separate line preceded KW -)',
+    'RP' => 'Reprint status (IN FILE, NOT IN FILE, ON REQUEST (MM/DD/YY))',
+    'SP' => 'Start page number',
+    'EP' => 'Ending page number',
+    'JF' => 'Periodical full name',
+    'JO' => 'Periodical standard abbreviation',
+    'JA' => 'Periodical in which article was published',
+    'J1' => 'Periodical name - User abbreviation 1',
+    'J2' => 'Periodical name - User abbreviation 2',
+    'VL' => 'Volume number',
+    'IS' => 'Issue number',
+    'T2' => 'Title secondary',
+    'CY' => 'City of Publication',
+    'PB' => 'Publisher',
+    'U1' => 'User definable 1',
+    'U2' => 'User definable 2',
+    'U3' => 'User definable 3',
+    'U4' => 'User definable 4',
+    'U5' => 'User definable 5',
+    'T3' => 'Title series',
+    'N2' => 'Abstract',
+    'SN' => 'ISSN/ISBN (e.g. ISSN XXXX-XXXX)',
+    'AV' => 'Availability',
+    'M1' => 'Misc. 1',
+    'M2' => 'Misc. 2',
+    'M3' => 'Misc. 3',
+    'AD' => 'Address',
+    'UR' => 'Web/URL',
+    'L1' => 'Link to PDF',
+    'L2' => 'Link to Full-text',
+    'L3' => 'Related records',
+    'L4' => 'Images',
+    'ER' => 'End of Reference (must be the last tag)',
+  );
+
+  /**
+   * Map of all types (tag TY) defined for RIS.
+   * @var array
+   * @see http://en.wikipedia.org/wiki/RIS_%28file_format%29
+   * @see http://www.refman.com/support/risformat_intro.asp
+   */
+  public static $typeMap = array(
+    'ABST' => 'Abstract',
+    'ADVS' => 'Audiovisual material',
+    'ART' => 'Art Work',
+    'BOOK' => 'Whole book',
+    'CASE' => 'Case',
+    'CHAP' => 'Book chapter',
+    'COMP' => 'Computer program',
+    'CONF' => 'Conference proceeding',
+    'CTLG' => 'Catalog',
+    'DATA' => 'Data file',
+    'ELEC' => 'Electronic Citation',
+    'GEN' => 'Generic',
+    'HEAR' => 'Hearing',
+    'ICOMM' => 'Internet Communication',
+    'INPR' => 'In Press',
+    'JFULL' => 'Journal (full)',
+    'JOUR' => 'Journal',
+    'MAP' => 'Map',
+    'MGZN' => 'Magazine article',
+    'MPCT' => 'Motion picture',
+    'MUSIC' => 'Music score',
+    'NEWS' => 'Newspaper',
+    'PAMP' => 'Pamphlet',
+    'PAT' => 'Patent',
+    'PCOMM' => 'Personal communication',
+    'RPRT' => 'Report',
+    'SER' => 'Serial publication',
+    'SLIDE' => 'Slide',
+    'SOUND' => 'Sound recording',
+    'STAT' => 'Statute',
+    'THES' => 'Thesis/Dissertation',
+    'UNPB' => 'Unpublished work',
+    'VIDEO' => 'Video recording',
+  );
+}