Skip to content

Commit

Permalink
Testing comment import
Browse files Browse the repository at this point in the history
  • Loading branch information
oncletom committed Aug 27, 2014
1 parent eaf6dbc commit 41ecb02
Show file tree
Hide file tree
Showing 3 changed files with 270 additions and 195 deletions.
182 changes: 91 additions & 91 deletions lib/Importer/Post.class.php
Expand Up @@ -162,6 +162,91 @@ public function extractPostContent($xpath) {
return $tmpDom;
}

/**
 * Extracts the comments embedded in a Canalblog post page.
 *
 * Every element flagged with itemprop="comment" and carrying a data-pid
 * attribute yields one entry. Elements without data-pid are ignored.
 *
 * The legacy date branch parses a French month name with strptime('%B'),
 * so it depends on LC_TIME being a French locale when this method runs
 * (saveComments() calls setlocale() before invoking it).
 * NOTE(review): strptime() is deprecated as of PHP 8.1 and is not
 * available on Windows; consider replacing it.
 *
 * @param DOMXPath $xpath XPath handle bound to the post document.
 *
 * @return array[] One associative array per comment. Each holds the
 *                 Canalblog comment identifier under '__comment_id'
 *                 next to the comment_* fields. 'comment_author' is only
 *                 present when an author could be determined. When the
 *                 date cannot be parsed, 'comment_date' falls back to
 *                 '0000-00-00 00:00:00'.
 */
public function extractComments($xpath) {
    // Fallback used when the legacy footer text cannot be parsed.
    $unknownDate = '0000-00-00 00:00:00';
    $comments = array();

    foreach ($xpath->query("//*[@itemprop='comment']") as $commentNode) {
        if (!$commentNode->hasAttribute('data-pid')) {
            continue;
        }

        $data = array(
            '__comment_id' => $commentNode->getAttribute('data-pid'),
            'comment_approved' => 1,
            'comment_karma' => 1,
            'comment_post_ID' => $this->id,
            'comment_author_email' => 'nobody@canalblog',
            'comment_agent' => 'Canalblog Importer',
            'comment_author_IP' => '127.0.0.1',
            'comment_type' => 'comment',
            'comment_author_url' => '',
        );

        /*
         * Content
         * Work on a detached copy of the comment so that stripping the
         * title and the footer does not alter the source document.
         */
        $tmpDom = new DomDocument();
        $tmpDom->appendChild($tmpDom->importNode($commentNode, true));
        $finder = new DomXpath($tmpDom);

        foreach (array("//h3", "//*[@class='itemfooter']") as $expression) {
            foreach ($finder->query($expression) as $item) {
                $item->parentNode->removeChild($item);
            }
        }

        $data['comment_content'] = trim($tmpDom->textContent);
        unset($finder, $tmpDom);

        /*
         * Footer
         * It may be missing; fall back on an empty text instead of
         * dereferencing a null node.
         */
        $footerNode = $xpath->query("div[@class='itemfooter']", $commentNode)->item(0);
        $footerText = (null === $footerNode) ? '' : $footerNode->textContent;

        /*
         * Author
         */
        $commentAuthor = $xpath->query("div[@class='itemfooter']/a", $commentNode);

        if ($commentAuthor->length) {
            $data['comment_author_url'] = $commentAuthor->item(0)->getAttribute('href');
            $data['comment_author'] = $commentAuthor->item(0)->textContent;
        }
        elseif (preg_match('#^Posté par (?P<author>[^,]+),#U', $footerText, $matches)) {
            $data['comment_author'] = $matches['author'];
        }

        /*
         * Date
         */
        // Modern Mode
        // The leading "." keeps the lookup inside the current comment: an
        // expression starting with "//" is evaluated from the document
        // root, whatever the context node is.
        $commentDate = $xpath->query(".//div[@class='itemfooter']/*[@class='timeago']", $commentNode);

        if ($commentDate->length) {
            $data['comment_date'] = str_replace('T', ' ', $commentDate->item(0)->getAttribute('title'));
        }
        // Legacy Mode
        else {
            $data['comment_date'] = $unknownDate;

            $tmp = trim(str_replace(array("\r\n", "\r", "\n"), ' ', $footerText));
            // NOTE(review): presumably the original replaced a non-breaking
            // space (U+00A0) by a regular one; spelled out as bytes here so
            // that it cannot be silently normalised away. Confirm.
            $tmp = str_replace("\xC2\xA0", ' ', $tmp);

            $dateMatches = array();

            if (preg_match('#, (le )?(?P<day>[^ ]+) (?P<month>[^ ]+) (?P<year>[^ ]+) (à|&agrave;?) (?P<hour>[^:]+):(?P<minute>.+)$#iUs', $tmp, $dateMatches)) {
                $parsed = strptime(
                    sprintf('%s %s %s %s:%s', $dateMatches['day'], $dateMatches['month'], $dateMatches['year'], $dateMatches['hour'], $dateMatches['minute']),
                    '%d %B %Y %H:%M'
                );

                // strptime() returns false on failure; keep the fallback
                // date rather than silently reporting January.
                if (false !== $parsed) {
                    $month = sprintf('%02s', $parsed['tm_mon'] + 1);

                    $data['comment_date'] = sprintf('%s-%s-%s %s:%s:00', $dateMatches['year'], $month, $dateMatches['day'], $dateMatches['hour'], $dateMatches['minute']);
                }
            }
        }

        $data['comment_date_gmt'] = $data['comment_date'];

        $comments[] = $data;
    }

    return $comments;
}

public function isImageSrcPattern($src, $media_pattern, $host) {
$hostname = parse_url($host, PHP_URL_HOST);
$media_pattern['detection_pattern_inline'] = str_replace('%canalblog_domain%', $hostname, $media_pattern['detection_pattern_inline']);
Expand Down Expand Up @@ -325,6 +410,7 @@ public function savePost(DomDocument $dom)
*/
public function saveComments(DomDocument $dom, $html)
{
$xpath = new DomXpath($dom);
$stats = array('count' => 0, 'new' => 0, 'skipped' => 0, 'overwritten' => 0);

if ($this->data['comment_status'] == 'closed')
Expand All @@ -336,106 +422,20 @@ public function saveComments(DomDocument $dom, $html)
* Canalblog is only in French, fortunately for us (and me...)
*/
setlocale(LC_TIME, 'fr_FR.UTF-8', 'fr_FR@euro', 'fr_FR', 'fr', 'french');
$date_pattern = '%s %s %s %s:%s';

list($tmp, $html_comments) = explode('<a id="comments">', $html);
unset($tmpdom, $tmp);

preg_match_all('#<a id="c\d+"></a>(.+)<div class="itemfooter">.+</div>#siU', $html_comments, $matches);
$found_comments = $matches[0];
$found_comments = $this->extractComments($xpath);
$stats['count'] = count($found_comments);
unset($matches);

if (empty($found_comments))
{
if (empty($found_comments)) {
return $stats;
}

$comments = get_comments(array('post_id' => $this->id));

foreach ($found_comments as $commentHtml)
{
$commentDom = $this->getDomDocumentFromHtml($commentHtml);

$xpath = new DomXpath($commentDom);
$commentNode = $commentDom->getElementsByTagName('body')->item(0);

if ($xpath->query("a[@id]", $commentNode)->length === 0)
{
continue;
}

/*
* Determining Canalblog comment ID
*/
$canalblog_comment_id = $xpath->query("a[@id]", $commentNode)->item(0)->getAttribute('id');

$data = array(
'comment_approved' => 1,
'comment_karma' => 1,
'comment_post_ID' => $this->id,
'comment_author_email' => 'nobody@canalblog',
'comment_agent' => 'Canalblog Importer',
'comment_author_IP' => '127.0.0.1',
'comment_type' => 'comment',
'comment_author_url' => '',
);

/*
* Comment Title
* We aggregate it into the comment content
*/
$tmpdom = new DomDocument();
if ($titleNode = $xpath->query('h3', $commentNode)->item(0))
{
$tmpnode = $tmpdom->createElement('p');
$tmpnode->appendChild($tmpdom->createElement('strong', esc_html($titleNode->textContent)));
$tmpdom->appendChild($tmpnode);
}

/*
* Comment content
* It's basically all direct <p>
*/
foreach ($xpath->query('//p', $commentNode) as $comment_p)
{
$tmpdom->appendChild($tmpdom->importNode($comment_p, true));
}

$data['comment_content'] = trim(preg_replace('#<p>[\s]*</p>#U', '', $tmpdom->saveHTML()));
unset($tmpdom, $tmpnode);

/*
* Comment footer
*/
$commentFooterNode = $xpath->query("div[@class='itemfooter']", $commentNode)->item(0);

//happens rarely, don't know why: we skip the import of this comment
if (null === $commentFooterNode)
{
$stats['skipped']++;
continue;
}

/*
* Comment author + URI + date
*/
if ($uriNode = $xpath->query("a", $commentFooterNode)->item(0))
{
$data['comment_author_url'] = $uriNode->getAttribute('href');
}
unset($uriNode);

$tmp = trim(str_replace(array("\r\n", "\r", "\n"), ' ', $commentFooterNode->textContent));
$tmp = str_replace(' ', ' ', $tmp);
preg_match('#^Post(é|&eacute;?) par (?P<comment_author>.+), (le )?(?P<day>[^ ]+) (?P<month>[^ ]+) (?P<year>[^ ]+) (à|&agrave;?) (?P<hour>[^:]+):(?P<minute>.+)$#iUs', $tmp, $matches);
$matches['strptime'] = strptime(sprintf($date_pattern, $matches['day'], $matches['month'], $matches['year'], $matches['hour'], $matches['minute']), '%d %B %Y %H:%M');
$matches['month'] = sprintf('%02s', $matches['strptime']['tm_mon'] + 1);

$data['comment_author'] = $matches['comment_author'];
$data['comment_date'] = sprintf('%s-%s-%s %s:%s:00', $matches['year'], $matches['month'], $matches['day'], $matches['hour'], $matches['minute']);
$data['comment_date_gmt'] = $data['comment_date'];
unset($matches);
foreach ($found_comments as $data) {
$canalblog_comment_id = $data['__comment_id'];
unset($data['__comment_id']);

/*
* Saving (only if not exists)
Expand Down

0 comments on commit 41ecb02

Please sign in to comment.