<a href="https://colab.research.google.com/github/wobias12/Nuxt/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task 1 – Web Scraping


In [None]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.10.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.1.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_

In [None]:
!scrapy startproject imdb_top250

New Scrapy project 'imdb_top250', using template directory '/usr/local/lib/python3.10/dist-packages/scrapy/templates/project', created in:
    /content/imdb_top250

You can start your first spider with:
    cd imdb_top250
    scrapy genspider example example.com


In [None]:
!ls imdb_top250/

imdb_top250  scrapy.cfg


In [None]:
%%writefile imdb_top250/imdb_top250/spiders/imdb_spider.py

#This code will only retrieve the information from the main website

import scrapy

class MovieSpider(scrapy.Spider):
    """Scrape the archived IMDb Top 250 chart page (no movie subpages).

    Every chart row that exposes at least three metadata entries is emitted
    as a dict with title, year, duration, age rating and IMDb rating.
    """

    name = "movie_spider"
    start_urls = [
        'https://web.archive.org/web/20231021155922/https://www.imdb.com/chart/top/',
    ]

    def parse(self, response):
        """Yield one item per chart row found in ``response``."""
        for row in response.css('div.ipc-metadata-list-summary-item__tc'):
            metadata = row.css('div.sc-c7e5f54-7.brlapf.cli-title-metadata span.sc-c7e5f54-8.hgjcbi.cli-title-metadata-item::text').getall()

            # Rows with fewer than three metadata entries are skipped, so
            # every emitted item carries year, duration and age rating.
            if len(metadata) < 3:
                continue

            # The rating is read from the star's aria-label; the fixed
            # prefix is stripped so only the remainder of the label is kept.
            rating_label = row.css('span.ipc-rating-star--imdb::attr(aria-label)').get()
            rating = rating_label.replace("IMDb rating: ", "") if rating_label else rating_label

            release_year, runtime, certificate = metadata[:3]

            yield {
                'title': row.css('h3.ipc-title__text::text').get(),
                'year': release_year,
                'duration': runtime,
                'agerating': certificate,
                'imdb_rating': rating
            }





Overwriting imdb_top250/imdb_top250/spiders/imdb_spider.py


In [None]:
%%writefile imdb_top250/imdb_top250/spiders/imdb_spider.py

# This spider retrieves information from the main chart page and from each movie's subpage

import scrapy
import re

class MovieSpider(scrapy.Spider):
    """Crawl the archived IMDb Top 250 chart and each movie's detail page.

    ``parse`` reads the chart rows and schedules one request per movie.
    ``parse_movie_subpage`` merges the chart fields (carried in the request
    ``meta``) with the poster path, plot, directors and cast found on the
    detail page, and yields the final item.
    """

    name = "movie_spider"
    start_urls = [
        'https://web.archive.org/web/20231021155922/https://www.imdb.com/chart/top/',
    ]

    def parse(self, response):
        """Schedule a detail-page request for every usable chart row.

        Rows with fewer than three metadata entries, or without a link to a
        detail page, are skipped (and logged) instead of aborting the crawl.
        """
        for movie in response.css('div.ipc-metadata-list-summary-item__tc'):

            title = movie.css('h3.ipc-title__text::text').get()

            movie_metadata = movie.css('div.sc-c7e5f54-7.brlapf.cli-title-metadata span.sc-c7e5f54-8.hgjcbi.cli-title-metadata-item::text').getall()
            if len(movie_metadata) < 3:
                self.logger.debug("Skipping %r: fewer than three metadata entries", title)
                continue
            year, duration, agerating = movie_metadata[:3]

            imdb_rating = movie.css('span.ipc-rating-star--imdb::attr(aria-label)').get()
            if imdb_rating:
                imdb_rating = imdb_rating.replace("IMDb rating: ", "")

            relative_url = movie.css('a.ipc-title-link-wrapper::attr(href)').get()
            if not relative_url:
                # response.follow() rejects a missing URL, which would stop
                # this generator and lose every remaining row.
                self.logger.warning("Skipping %r: no detail-page link found", title)
                continue

            # Accept ids with seven *or more* digits; requiring exactly seven
            # left imdb_id as None for titles with longer ids.
            match = re.search(r'/title/(tt\d{7,})/', relative_url)
            imdb_id = match.group(1) if match else None

            full_url = response.urljoin(relative_url)

            yield response.follow(
                full_url,
                # Must name the method defined below; the previous callback
                # name did not exist on this class.
                callback=self.parse_movie_subpage,
                meta={
                    'title': title,
                    'year': year,
                    'duration': duration,
                    'agerating': agerating,
                    'rating': imdb_rating,
                    'imdb_id': imdb_id,
                    'full_url': full_url
                }
            )

    def parse_movie_subpage(self, response):
        """Yield the complete item for one movie.

        Combines the chart fields stored in ``response.meta`` by ``parse``
        with the poster path, plot, directors and cast read from the page.
        """
        meta = response.meta
        imdb_id = meta['imdb_id']

        # Only the "/mediaviewer/<id>" fragment of the overlay link is kept;
        # the poster path is rebuilt as a site-relative "/title/<imdb_id>/..."
        # path, and left as None when either piece is missing.
        relative_poster_url = response.css("a.ipc-lockup-overlay::attr(href)").re_first(r"/mediaviewer/[^/]+")
        poster_url = None
        if relative_poster_url and imdb_id:
            poster_url = f"/title/{imdb_id}{relative_poster_url}"

        plot = response.xpath('//span[@data-testid="plot-xs_to_m"]/text()').get()

        directors = response.xpath('//li[a[contains(@href, "ref_=tt_cl_dr")]]/a/text()').getall()

        cast = response.xpath('//a[@data-testid="title-cast-item__actor"]/text()').getall()

        yield {
            'title': meta['title'],
            'year': meta['year'],
            'duration': meta['duration'],
            'agerating': meta['agerating'],
            'rating': meta['rating'],
            'imdb_id': imdb_id,
            'full_url': meta['full_url'],
            'poster_url': poster_url,
            'plot': plot,
            'directors': directors,
            'cast': cast
        }

Overwriting imdb_top250/imdb_top250/spiders/imdb_spider.py


In [None]:
!scrapy runspider imdb_top250/imdb_top250/spiders/imdb_spider.py

2024-11-03 16:35:38 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-11-03 16:35:38 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.10.0, Python 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-11-03 16:35:38 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)
2024-11-03 16:35:38 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-11-03 16:35:38 [scrapy.extensions.telnet] INFO: Telnet Password: 743ea9fced701e8d
2024-11-03 16:35:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 's

## Task 2 – Information Retrieval System

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re


# Fetch the NLTK resources up front, in the same order as before. 'stopwords'
# backs stopwords.words(); 'wordnet' is presumably what WordNetLemmatizer
# needs; no tokenizer call using 'punkt' is visible in this cell.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)


# Load the movie records: one JSON object per line.
with open('/content/sample_data/imdb-movies.jsonl') as jsonl_handle:
    movie_data = pd.read_json(jsonl_handle, lines=True)


def _combine_movie_fields(row):
    """Concatenate title, plot, directors, actors and year of one row into a single string."""
    director_names = ' '.join(map(str, row['directors']))
    actor_names = ' '.join(map(str, row['actors']))
    return f"{row['title']} {row['plot']} {director_names} {actor_names} {row['year']}"


# One searchable text blob per movie.
movie_data['info_combined'] = movie_data.apply(_combine_movie_fields, axis=1)


# Shared lemmatizer instance used by the text-preparation function.
lemmatizer = WordNetLemmatizer()

# Built once: the stop-word list was previously rebuilt from the corpus on
# every call, i.e. once per movie row and once per query.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def advanced_prepare_text(text):
    """Normalise free text for vectorisation.

    Steps, in order: remove digit runs, remove every character that is
    neither a word character nor whitespace, lower-case, split on
    whitespace, drop English stop words and lemmatize the remaining words.

    Args:
        text: Raw string (a combined movie description or a user query).

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    # NOTE(review): stripping digits also removes numeric tokens such as
    # release years and changes titles that contain digits (e.g. "Se7en"
    # becomes "seen") - confirm this is intended.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in _ENGLISH_STOP_WORDS
    )


# Run every combined description through the text-preparation pipeline.
prepared_texts = movie_data['info_combined'].map(advanced_prepare_text)
movie_data['info_prepared'] = prepared_texts


# Bag-of-words index over unigrams and bigrams of the prepared descriptions.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
count_matrix = vectorizer.fit_transform(prepared_texts)


def find_movie(query):
    """Return the title and year of the movie that best matches ``query``.

    The query is run through ``advanced_prepare_text`` (the same pipeline
    used for the indexed descriptions), vectorised with the fitted
    ``vectorizer`` and compared with every row of ``count_matrix`` by
    cosine similarity.

    Args:
        query: Free-text description of the movie being looked for.

    Returns:
        A dict with the keys 'title' and 'year', taken from the
        best-scoring row of ``movie_data``. Both values are ``None`` when
        the query shares no term with any indexed movie.
    """
    query_prepared = advanced_prepare_text(query)

    query_vector = vectorizer.transform([query_prepared])

    similarities = cosine_similarity(query_vector, count_matrix).flatten()

    # With no overlapping term every score is 0 and argmax() would return
    # index 0, presenting the first movie in the dataset as a match.
    if similarities.size == 0 or similarities.max() <= 0:
        return {'title': None, 'year': None}

    best_match = movie_data.iloc[similarities.argmax()]

    return {
        'title': best_match['title'],
        'year': best_match['year']
    }

# Enter your queries here: uncomment the examples below or add your own strings.
# While the list is empty, the loop over `queries` further down prints nothing.
queries = [
   # "Action movie about a naval aviator starring Tom Cruise",
   # "A movie where a man witnesses his parents' death and then learns the art of fighting to confront injustice",
   # "An animation where a cowboy doll feels threatened when a new spaceman action figure becomes the top toy in a boy's bedroom.",
   # "A film about a Scottish warrior starring Mel Gibson."
]


def display_result(result):
    """Print a short recommendation banner for one search result.

    Args:
        result: Mapping with the keys 'title' and 'year'.
    """
    title = result['title']
    year = result['year']
    message = f"--- Movie Recommendation ---\nTitle: {title}\nRelease Year: {year}\n"
    print(message)

# Look up each query in order and print its best match.
for user_query in queries:
    display_result(find_movie(user_query))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


--- Movie Recommendation ---
Title: 25. Saving Private Ryan
Release Year: 1998

--- Movie Recommendation ---
Title: 128. Batman Begins
Release Year: 2005

--- Movie Recommendation ---
Title: 76. Toy Story
Release Year: 1995

--- Movie Recommendation ---
Title: 78. Braveheart
Release Year: 1995

--- Movie Recommendation ---
Title: 19. Se7en
Release Year: 1995

