In [16]:
def read_file(file_path):
    try:
        with open(file_path, 'r') as file:
            contents = file.read()
            print(contents)
    except Exception as e:
        print(f"An error occurred: {e}")

1. Process the citation.txt input file and output the number of papers published in each decade: 1970s, 1980s, 1990s, 2000s, 2010s, and 2020s.

In [17]:
%%file rc1.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRjob_assignment1(MRJob):
    def map_decades(self, _, data):
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}
        records = data.split('#')

        for record in records:
            if record.startswith('*'):
                paper_info['title'] = record[1:].strip() if len(record) > 0 else None
            elif record.startswith("t"):
                paper_info['year'] = int(record[1:].strip()) if len(record) > 1 else None

        if paper_info['year'] is not None:
            decade = (paper_info['year'] // 10) * 10
            yield (decade, 1)

    def combiner_decades(self, decade, counts):
        yield (decade, sum(counts))

    def reducer_decades(self, decade, counts):
        yield (decade, sum(counts))

    def steps(self):
        return [
            MRStep(
                mapper=self.map_decades,
                combiner=self.combiner_decades,
                reducer=self.reducer_decades
            )]

if __name__ == '__main__':
    MRjob_assignment1.run()


Writing rc1.py


In [None]:
!python rc1.py citation.txt > out1.txt

In [19]:
print('output of job1 : \n')
read_file('out1.txt')

output of job1 : 

1970	4
1980	3
1990	11
2000	82



2. Create an inverted index of the citation file. Your inverted index will output the year followed by the comma-separated list of the titles of the papers published in that year.

In [20]:
%%file rc2.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRjob_assignment2(MRJob):
    def map_index(self, _, data):
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}
        records = data.split('#')

        for record in records:
            if record.startswith('*'):
                paper_info['title'] = record[1:].strip() if len(record) > 0 else None
            elif record.startswith("t"):
                paper_info['year'] = int(record[1:].strip()) if len(record) > 1 else None

        if paper_info['year'] is not None:
            title = paper_info['title']
            year = paper_info['year']
            yield (year, title)

    def combiner_index(self, year, titles):
        titles_list = list(titles)
        filtered_titles = [title for title in titles_list if title is not None]
        yield year, ', '.join(filtered_titles)

    def reducer_index(self, year, filtered_titles):
        final_titles = [title for title in filtered_titles if title is not None]
        yield year, ', '.join(final_titles)

    def steps(self):
        return [
            MRStep(
                mapper=self.map_index,
                combiner=self.combiner_index,
                reducer=self.reducer_index
            )
        ]

if __name__ == '__main__':
    MRjob_assignment2.run()


Writing rc2.py


In [None]:
!python rc2.py citation.txt > out2.txt

In [22]:
print('Output of job 2 : \n')
read_file('out2.txt')

Output of job 2 : 

1973	"Notes from industry"
1975	"A control word model for detecting conflicts between microoperations"
1976	"Microprogramming for the hardware engineer"
1978	"Design team composition for high level language computer architectures"
1982	"Review of \"Bit-Slice Microprocessor Design by John Mick and James Brick\", McGraw-Hill Book Company, 1980"
1985	"Word Processing on Your MacIntosh"
1987	"Type Graphics and MacIntosh"
1991	"Tarski's World 3.0: Including the Macintosh TM Program (Center for the Study of Language and Information - Lecture Notes)"
1993	"Hyperstat: Macintosh Hypermedia for Analyzing Data and Learning Statistics"
1994	"At Ease With Performa, It's a Mad, Mad, Mad, Mad Mac/Book and Disk, Operations Research: Macintosh Version (Business Statistics Series)"
1995	"Internet and HTML Training on CD-ROM"
1996	"Fast k-NN Classification Rule Using Metrics on Space-Filling Curves, A New Quadtree Decomposition Reconstruction Method"
1997	"Multimedia Directory 1997, E

3. Produce a list of co-authors of each author in the given input file.

In [23]:
%%file rc3.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRjob_assignment3(MRJob):
    def map_coauthors(self, _, data):
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}
        records = data.split('#')

        for record in records:
            if record.startswith("@"):
                paper_info['authors'] = record[1:].strip() if len(record) > 0 else None
            
        if paper_info['authors'] is not None:
            authors = paper_info['authors'].split(',')
            for i in range(len(authors)):
                for j in range(i + 1, len(authors)):
                    yield (authors[i].strip(), authors[j].strip())
                    yield (authors[j].strip(), authors[i].strip())

    def reducer_coauthors(self, author, coauthors):
        coauthors_set = set(coauthors)
        yield author, list(coauthors_set)

    def steps(self):
        return [
            MRStep(
                mapper=self.map_coauthors,
                reducer=self.reducer_coauthors
            )
        ]

if __name__ == '__main__':
    MRjob_assignment3.run()


Writing rc3.py


In [None]:
!python rc3.py citation.txt > out3.txt

In [25]:
print('Output of job 3 : \n')
read_file('out3.txt')

Output of job 3 : 

"A. Krzyzak"	["E. Skubalska-Rafajtowicz"]
"Ahmed Hassan"	["Parminder Flora"]
"Alex Galis"	["Arto Tapani Juhola", "Danny Raz", "Joan Serrat-Fernandez"]
"Alexander Gelbukh"	["Carlos Alberto Reyes-Garcia"]
"Alice Redmond-neal"	["Marjorie M. K. Hlava"]
"Aline Maria Santos Andrade"	["Carlos Alberto Maziero", "Fl\u00e1vio Morais de Assis Silva", "Jo\u00e3o Gabriel Silva"]
"Amir Ahmad"	["Lipika Dey"]
"Amitabh Chaudhary"	["Christian Scheideler", "Ankur Bhargava", "Amitabha Bagchi", "David Eppstein"]
"Amitabha Bagchi"	["Christian Scheideler", "Ankur Bhargava", "David Eppstein", "Amitabh Chaudhary"]
"Andreas N\u00fcrnberger"	["Marcin Detyniecki"]
"Anita Kesavan"	["Neil Daswani"]
"Ankur Bhargava"	["Christian Scheideler", "Amitabha Bagchi", "David Eppstein", "Amitabh Chaudhary"]
"Arthur Greef"	["Hans J. Skovgaard", "Lars Dragheim Olsen", "Michael Fruergaard Pontoppidan", "Palle Agermark"]
"Arto Tapani Juhola"	["Danny Raz", "Joan Serrat-Fernandez", "Alex Galis"]
"Barry Smyth"	["

4. Find the average number of papers published each year.

In [26]:
%%file rc4.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRjob_assignment4(MRJob):
    def map_papers(self, _, data):
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}
        records = data.split('#')

        for record in records:
            if record.startswith("t"):
                paper_info['year'] = int(record[1:].strip()) if len(record) > 1 else None

        if paper_info['year'] is not None:
            yield (paper_info['year'], 1)

    def combiner_count_papers(self, year, counts):
        yield (year, sum(counts))

    def reducer_count_papers(self, year, counts):
        yield None, (year, sum(counts))

    def reducer_average_papers(self, _, year_counts):
        total_papers = 0
        total_years = 0

        for year, count in year_counts:
            total_papers += count
            total_years += 1

        average_papers_per_year = total_papers / total_years
        yield "Average Papers per Year:", average_papers_per_year

    def steps(self):
        return [
            MRStep(
                mapper=self.map_papers,
                combiner=self.combiner_count_papers,
                reducer=self.reducer_count_papers
            ),
            MRStep(
                reducer=self.reducer_average_papers
            )
        ]

if __name__ == '__main__':
    MRjob_assignment4.run()


Writing rc4.py


In [None]:
!python rc4.py citation.txt > out4.txt

In [28]:
print('Output of job 4 : \n')
read_file('out4.txt')

Output of job 4 : 

"Average Papers per Year:"	4.545454545454546



5. List the names of authors who have written the maximum number of papers.

In [29]:
%%file rc5.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRJob_assignment5(MRJob):

    def mapper(self, _, data):
        records = data.split('#')
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}

        for record in records:
            if record.startswith('*'):
                paper_info['title'] = record[1:].strip() if len(record) > 0 else None
            elif record.startswith("@"):
                paper_info['authors'] = record[1:].strip() if len(record) > 0 else None

        if paper_info['authors']:
            authors = paper_info['authors'].split(',')
            for author in authors:
                yield author.strip(), 1

    def combiner(self, author, counts):
        yield author, sum(counts)

    def reducer(self, author, counts):
        yield None, (author, sum(counts))

    def reducer_max_authors_init(self):
        self.max_count = 0
        self.max_authors = []

    def reducer_max_authors(self, _, author_counts):
        for author, count in author_counts:
            if author:
                if count > self.max_count:
                    self.max_count = count
                    self.max_authors = [author]
                elif count == self.max_count:
                    self.max_authors.append(author)

    def reducer_max_authors_final(self):
        if self.max_authors:
            yield (self.max_authors, self.max_count)

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer
            ),
            MRStep(
                reducer_init=self.reducer_max_authors_init,
                reducer=self.reducer_max_authors,
                reducer_final=self.reducer_max_authors_final
            )
        ]

if __name__ == '__main__':
    MRJob_assignment5.run()


Writing rc5.py


In [None]:
!python rc5.py citation.txt > out5.txt

In [31]:
print('Output of job 5 : \n')
read_file('out5.txt')

Output of job 5 : 

["Cay S. Horstmann", "Charles J. Brooks"]	2



6. Find the names of authors who have written at most one paper in a year.

In [32]:
%%file rc6.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRJob_assignment6(MRJob):

    def mapper(self, _, data):
        records = data.split('#')
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}

        for record in records:
            if record.startswith("@"):
                paper_info['authors'] = record[1:].strip() if len(record) > 0 else None
            elif record.startswith("t"):
                paper_info['year'] = int(record[1:].strip()) if len(record) > 1 else None

        if paper_info['authors']:
            authors = paper_info['authors'].split(',')
            for author in authors:
                yield (paper_info['year'], author.strip()), 1

    def combiner(self, year_author, counts):
        yield year_author, sum(counts)

    def reducer(self, year_author, counts):
        total_count = sum(counts)
        year, author = year_author
        if total_count == 1:
            yield year, author

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer
            )
        ]

if __name__ == '__main__':
    MRJob_assignment6.run()


Overwriting rc6.py


In [None]:
!python rc6.py citation.txt > out6.txt

In [34]:
print('Output of job 6 : \n')
read_file('out6.txt')

Output of job 6 : 

1973	"Stanley Habib"
1975	"Bruce Shriver"
1975	"Ted Lewis"
1976	"John R. Mick"
1978	"Charles S. Wetherell"
1978	"James R. McGraw"
1978	"Jr."
1978	"Lyle A. Cox"
1982	"William J. Tracz"
1985	"Rudolph Langer"
1987	"John Blaint"
1991	"John Etchemendy"
1991	"Jon Barwise"
1993	"David M. Lane"
1994	"Carla Rose"
1994	"Gene Orwell"
1994	"Wayne L. Winston"
1996	"A. Krzyzak"
1996	"E. Skubalska-Rafajtowicz"
1996	"J. Knipe"
1996	"X. Li"
1997	"W. E. Clason"
1999	"Donald Christiansen"
2000	"Ken Abernethy"
2002	"Brenden Munnelly"
2002	"Paul Holden"
2003	"Charles J. Brooks"
2003	"John Odam"
2003	"Jose Pedro Llamazares"
2003	"Michael Cloran"
2004	"Andreas N\u00fcrnberger"
2004	"Deborah Timmons"
2004	"Denise Seguin"
2004	"Derrick Story"
2004	"John Preston"
2004	"Marcin Detyniecki"
2004	"Michael T. Goodrich"
2004	"Nita Hewitt Rutkosky"
2004	"Sally Preston"
2004	"Shelley Gaskin"
2004	"Tom Collins"
2005	"Alice Redmond-neal"
2005	"Axel Bucker"
2005	"Carlito Vicencio"
2005	"Celso H. Podero

7. Find the title of papers such that their venue is not mentioned in the input file.

In [35]:
%%file rc7.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRJob_assignment7(MRJob):

    def mapper(self, _, data):
        records = data.split('#')
        paper_info = {'title': None, 'authors': None, 'year': None, 'venue': None, 'index_id': None}

        for record in records:
            if record.startswith('*'):
                paper_info['title'] = record[1:].strip() if len(record) > 0 else None
            elif record.startswith("c"):
                paper_info['venue'] = record[1:].strip() if len(record) > 1 else None

        if paper_info['venue'] is None and paper_info['title'] is not None:
            yield None, paper_info['title']

    def reducer(self, _, titles):
        title_list = list(titles)
        
        yield "Papers with No Venue:", title_list

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper,
                reducer=self.reducer
            )
        ]

if __name__ == '__main__':
    MRJob_assignment7.run()


Writing rc7.py


In [None]:
!python rc7.py citation.txt > out7.txt

In [37]:
print('Output of job 7 : \n')
read_file('out7.txt')

Output of job 7 : 

"Papers with No Venue:"	["Automated Deduction in Geometry", "A+ Certification Core Hardware (Text & Lab Manual)", "Performance engineering in industry: current practices and adoption challenges", "Dude, You Can Do It! How to Build a Sweeet PC", "What Every Programmer Needs to Know about Security (Advances in information Security)", "Interpreting Kullback-Leibler divergence with the Neyman-earson lemma", "Digital Media: Transformations in Human Communication", "TOPP---the OpenMS proteomics pipeline", "Type Graphics and MacIntosh", "Adaptive Hypermedia and Adaptive Web-Based Systems", "Dependable Computing", "Calculus Early Transcendentals Single Variable", "Webbots, Spiders, and Screen Scrapers", "Fast k-NN Classification Rule Using Metrics on Space-Filling Curves", "Making the Digital City: The Early Shaping of Urban Internet Space (Design & the Built Environment S.)", "Linspire 5.0: The No Nonsense Guide! (No Nonsense Guide! series)", "Podcasting for Profit: A Prov