Add support for scraping from Caribpr

mvdctop · May 18, 2023 · 4383491 · 4383491
1 parent 7dcc4c2
commit 4383491
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 2 deletions.
diff --git a/config.ini b/config.ini
@@ -58,7 +58,7 @@ image_naming_with_number = 0
 update_check = 1
 
 [priority]
-website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,getchu,javdb,gcolle,javday,javmenu
+website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,getchu,javdb,gcolle,javday,javmenu,caribpr
 
 [escape]
 literals = \()/

diff --git a/number_parser.py b/number_parser.py
@@ -99,6 +99,7 @@ def get_number(debug: bool, file_path: str) -> str:
     'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
     'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
     'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()),
+    'caribpr': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
 }
 
 

diff --git a/scrapinglib/api.py b/scrapinglib/api.py
@@ -34,7 +34,7 @@ class Scraping:
     """
     adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
                           'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
-                          'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu'
+                          'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu', 'caribpr'
                           ]
 
     general_full_sources = ['tmdb', 'imdb']
@@ -216,6 +216,9 @@ def insert(sources, source):
             if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
             ):
                 sources = insert(sources, "carib")
+            elif "caribpr" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
+            ):
+                sources = insert(sources, "caribpr")
             elif "item" in file_number or "GETCHU" in file_number.upper():
                 sources = insert(sources, "getchu")
             elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+",

diff --git a/scrapinglib/caribpr.py b/scrapinglib/caribpr.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+import re
+from urllib.parse import urljoin
+from lxml import html
+from .parser import Parser
+
+
+class Caribpr(Parser):
+    """Scraper for caribbeancompr.com movie pages (source id ``caribpr``)."""
+
+    source = 'caribpr'
+
+    # XPath expressions evaluated against the parsed detail page.
+    expr_title = "//div[@class='movie-info']/div[@class='section is-wide']/div[@class='heading']/h1/text()"
+    expr_release = "//li[2]/span[@class='spec-content']/text()"
+    expr_runtime = "//li[3]/span[@class='spec-content']/text()"
+    expr_actor = "//li[1]/span[@class='spec-content']/a[@class='spec-item']/text()"
+    expr_tags = "//li[5]/span[@class='spec-content']/a[@class='spec-item']/text()"
+    expr_extrafanart = "//div[@class='movie-gallery']/div[@class='section is-wide']/div[2]/div[@class='grid-item']/div/a/@href"
+    # expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
+    # expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
+    expr_outline = "//div[@class='movie-info']/div[@class='section is-wide']/p/text()"
+
+    def extraInit(self):
+        """Set the per-source flags: ``imagecut`` to 1 and ``uncensored`` on."""
+        self.uncensored = True
+        self.imagecut = 1
+
+    def search(self, number):
+        """Load the detail page for ``number`` and parse it; 404 when unusable."""
+        self.number = number
+        default_url = f'https://www.caribbeancompr.com/moviepages/{number}/index.html'
+        self.detailurl = self.specifiedUrl if self.specifiedUrl else default_url
+        page = self.getHtml(self.detailurl)
+        if page == 404 or 'class="movie-info"' not in page:
+            return 404
+        return self.dictformat(html.fromstring(page))
+
+    def getStudio(self, htmltree):
+        """Return the fixed studio name used for this source."""
+        return '加勒比'
+
+    def getActors(self, htmltree):
+        """Return the base parser's actors minus the '他' placeholder entry."""
+        return [name for name in super().getActors(htmltree) if str(name) != '他']
+
+    def getNum(self, htmltree):
+        """Return the number passed to ``search``; the page is not consulted."""
+        return self.number
+
+    def getCover(self, htmltree):
+        """Build the cover image URL from the movie number."""
+        return f'https://www.caribbeancompr.com/moviepages/{self.number}/images/l_l.jpg'
+
+    def getExtrafanart(self, htmltree):
+        """Collect gallery links up to the first one containing '/member/'."""
+        gallery = []
+        for href in self.getTreeAll(htmltree, self.expr_extrafanart):
+            link = str(href)
+            if '/member/' in link:
+                break
+            gallery.append(link)
+        return gallery
+
+    def getTrailer(self, htmltree):
+        """Build the 480p sample video URL from the movie number."""
+        return f'https://smovie.caribbeancompr.com/sample/movies/{self.number}/480p.mp4'
+
+    def getActorPhoto(self, htmltree):
+        """Map actor names to photo URLs read from each linked '/search_act/' page."""
+        actor_xpath = ("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']"
+                       "/div[@class='movie-info section']/ul/li[@class='movie-spec']"
+                       "/span[@class='spec-content']/a[@itemprop='actor']")
+        anchors = htmltree.xpath(actor_xpath)
+        names = htmltree.xpath(actor_xpath + "/span[@itemprop='name']/text()")
+        # Stripped name -> link target, skipping the '他' placeholder.
+        links = {name.strip(): anchor.attrib['href']
+                 for name, anchor in zip(names, anchors) if name.strip() != '他'}
+        photos = {}
+        for actor, href in links.items():
+            if '/search_act/' not in href:
+                continue
+            resp = self.getHtml(urljoin('https://www.caribbeancompr.com', href), type='object')
+            if not resp.ok:
+                continue
+            body = resp.text
+            start = body.find('.full-bg')
+            if start < 0:
+                continue
+            # Only the 100 characters starting at '.full-bg' are scanned for the image.
+            found = re.findall(r'background: url\((.+\.jpg)', body[start:start + 100], re.I)
+            if found and found[0]:
+                photos[actor] = urljoin(resp.url, found[0])
+        return photos
+
+    def getOutline(self, htmltree):
+        """Prefer ``getStoryline``'s result when ``morestoryline`` is set and it is non-empty."""
+        if self.morestoryline:
+            from .storyline import getStoryline
+            story = getStoryline(self.number, uncensored=self.uncensored,
+                                 proxies=self.proxies, verify=self.verify)
+            if len(story):
+                return story
+        return super().getOutline(htmltree)
+