In [1]:
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np 

In [2]:
class CategoryNode:
    """One category in the category tree.

    A node only stores data; every attribute other than ``category`` starts
    out empty and is filled in by whoever builds the tree.
    """

    def __init__(self, category):
        """Create a node named ``category`` with no children and no payload."""
        self.category = category
        # Three independent empty lists: child nodes, pages, texts.
        self.children, self.pages, self.texts = [], [], []
        # No vector until one is assigned from outside.
        self.vector = None

    def add_child(self, node):
        """Append ``node`` to the end of this node's ``children`` list."""
        self.children.append(node)

    def __str__(self):
        """Use the stored ``category`` value as the printable form."""
        return self.category
    

In [3]:
class CategoryTree:
    """Category hierarchy that can be searched by text similarity.

    ``load_json`` builds up to three levels of ``CategoryNode`` objects below
    ``self.root``, fits the vectorizer on all node texts and gives every node
    a vector (the mean of the vectorizer output over the node's texts).
    ``search`` then walks down the tree, following the most similar child at
    each level.
    """

    # Keys of a JSON category entry that hold the node's own payload; every
    # other key of an entry is read as the name of a sub-category.
    _PAYLOAD_KEYS = ("pages", "texts")

    # Number of category levels below the root that are loaded and searched.
    _LEVELS = 3

    def __init__(self, vectorizer):
        """Create an empty tree.

        Args:
            vectorizer: object providing ``fit(texts)`` and
                ``transform(texts)``; ``transform`` must return a 2-D
                array-like with one row per text.
        """
        self.root = CategoryNode("Root")
        # Kept so existing callers can still access it; nothing in this
        # class reads or fills it.
        self.corpus = []
        self.vectorizer = vectorizer

    def load_json(self, file):
        """Load the category tree from ``file``, fit the vectorizer, vectorize.

        Each category entry must be a mapping with ``"pages"`` and
        ``"texts"`` keys; any other key is treated as a sub-category with the
        same layout. Only ``_LEVELS`` levels are read; deeper entries are
        ignored. New nodes are appended to the existing children of the root.

        Args:
            file: path of the JSON file. The file may contain the tree object
                directly or a JSON string that itself encodes the tree.

        Raises:
            KeyError: if a category entry lacks ``"pages"`` or ``"texts"``.
        """
        # Context manager so the handle is closed even if decoding fails.
        with open(file, 'r') as handle:
            tree = json.load(handle)
        # Double-encoded files decode to a str first; decode once more.
        if isinstance(tree, str):
            tree = json.loads(tree)
        corpus = []
        print("Category Tree loading...")

        self._attach_children(self.root, tree, list(tree), corpus, 1)

        print("Vectorizer is fitting...")
        self.vectorizer.fit(corpus)

        # Drop the local references before vectorizing; the nodes keep
        # their own ``texts`` lists.
        del corpus
        del tree

        print("Transforming tree...")
        self.vectorize()

        print("Tree is ready!")

    def _attach_children(self, parent, subtree, names, corpus, level):
        """Create a node under ``parent`` for each of ``names`` in ``subtree``.

        Node texts are appended to ``corpus`` in place. Recurses into
        sub-categories until ``_LEVELS`` levels have been created.
        """
        for name in names:
            entry = subtree[name]
            node = CategoryNode(name)
            parent.add_child(node)
            node.pages = entry['pages']
            node.texts = entry['texts']
            corpus.extend(node.texts)
            if level < self._LEVELS:
                # Iterate the mapping itself (insertion order) so the child
                # order is the same on every run.
                child_names = [key for key in entry
                               if key not in self._PAYLOAD_KEYS]
                self._attach_children(node, entry, child_names, corpus,
                                      level + 1)

    def vectorize(self):
        """Assign ``node.vector`` for every node reachable from the root.

        A node's vector is a ``(1, n_features)`` ndarray holding the mean of
        the vectorizer output over ``node.texts``. Nodes whose texts cannot
        be vectorized (for example an empty list) get an all-zero vector of
        the same width.
        """
        zero_width = None  # looked up lazily, at most once per call
        pending = list(self.root.children)
        while pending:
            node = pending.pop()
            vector = self._mean_vector(node.texts)
            if vector is None:
                if zero_width is None:
                    zero_width = self._feature_count()
                vector = np.zeros((1, zero_width))
            node.vector = vector
            pending.extend(node.children)

    def _mean_vector(self, texts):
        """Return the mean transformed row for ``texts``, or None if unavailable."""
        if len(texts) == 0:
            return None
        try:
            transformed = self.vectorizer.transform(texts)
            mean_row = np.mean(transformed, axis=0)
        except ValueError:
            # Best effort, as before: the caller substitutes a zero vector.
            return None
        # np.mean over a scipy sparse matrix yields np.matrix; store a plain
        # 2-D ndarray instead, which similarity functions accept.
        return np.asarray(mean_row).reshape(1, -1)

    def _feature_count(self):
        """Return the width of the vectorizer output, probed with one empty text."""
        return self.vectorizer.transform([""]).shape[1]

    def search(self, words, similarity_metric):
        """Follow the most similar child at each level of the tree.

        Args:
            words: iterable of texts passed to ``vectorizer.transform``; only
                element ``[0][0]`` of each similarity result is used.
            similarity_metric: callable ``(node_vector, input_vector)``
                returning a 2-D indexable result, e.g. ``cosine_similarity``.

        Returns:
            A list of ``_LEVELS`` ``(score, node)`` tuples, top level first.
            A level is ``(0, None)`` when no child scored strictly above 0 or
            when the previous level had no match, in which case all remaining
            levels are ``(0, None)`` as well.
        """
        input_vector = self.vectorizer.transform(words)
        result = []
        current = self.root
        for _ in range(self._LEVELS):
            best = 0, None
            # ``current`` is None once a level produced no match; there is
            # nothing to descend into, so the level is reported as empty.
            if current is not None:
                for node in current.children:
                    score = similarity_metric(node.vector, input_vector)[0][0]
                    if score > best[0]:
                        best = score, node
            result.append(best)
            current = best[1]
        return result
            

In [4]:
# Build the tree around a TF-IDF vectorizer limited to 100000 features, with
# NLTK's English stop words removed, then load and vectorize the categories.
tree = CategoryTree(
    TfidfVectorizer(
        max_features=100000,
        stop_words=list(stopwords.words('english')),
    )
)
tree.load_json('search_tree_with_text.json')

Category Tree loading...
Vectorizer is fitting...
Transforming tree...
Tree is ready!


In [None]:
# Sample query document: the full text of a news article, passed to
# `tree.search` in the next cell. The text is runtime data and is kept
# verbatim, including the repeated photo captions.
article_text = """
NASHVILLE, Tenn. (AP) — Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on records and on TV, died Friday night. He was 81.

FILE - In this Oct. 24, 2017 file photo, Kenny Rogers poses with his star on the Music City Walk of Fame in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81. (AP Photo/Mark Humphrey, File)© Provided by Associated Press FILE - In this Oct. 24, 2017 file photo, Kenny Rogers poses with his star on the Music City Walk of Fame in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81. (AP Photo/Mark Humphrey, File)
He died at home in Sandy Springs, Georgia, representative Keith Hagan told The Associated Press. He was under hospice care and died of natural causes, Hagan said.

The Houston-born performer with the husky voice and silver beard sold tens of millions of records, won three Grammys and was the star of TV movies based on “The Gambler” and other songs, making him a superstar in the ‘70s and ’80s. Rogers thrived for some 60 years before retired from touring in 2017 at age 79. Despite his crossover success, he always preferred to be thought of as a country singer.

“You either do what everyone else is doing and you do it better, or you do what no one else is doing and you don’t invite comparison,” Rogers told The Associated Press in 2015. “And I chose that way because I could never be better than Johnny Cash or Willie or Waylon at what they did. So I found something that I could do that didn’t invite comparison to them. And I think people thought it was my desire to change country music. But that was never my issue.”

In this Oct. 27, 2013, file photo, country music star Kenny Rogers thanks the audience at the ceremony for the 2013 inductions into the Country Music Hall of Fame in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81. (AP Photo/Mark Zaleski, File)© Provided by Associated Press ADDS YEAR - FILE - In this Oct. 27, 2013, file photo, country music star Kenny Rogers thanks the audience at the ceremony for the 2013 inductions into the Country Music Hall of Fame in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81.
His “Islands in the Stream” duet partner Dolly Parton posted a video on Twitter on Saturday morning, choking up as she held a picture of the two of them together. “I loved Kenny with all my heart and my heart is broken and a big ole chunk of it is gone with him today," Parton said in the video.

“Kenny was one of those artists who transcended beyond one format and geographic borders,” says Sarah Trahern, chief executive officer of the Country Music Association. “He was a global superstar who helped introduce country music to audiences all around the world."

This May 17, 1989 file photo shows Kenny Rogers posing for a portrait in Los Angeles.   Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020.  (AP Photo/Bob Galbraith, File)© Provided by Associated Press FILE - This May 17, 1989 file photo shows Kenny Rogers posing for a portrait in Los Angeles. Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/Bob Galbraith, File)
Rogers was a five-time CMA Award winner, as well as the recipient of the CMA's Willie Nelson Lifetime Achievement Award in 2013, the same year he was inducted into the Country Music Hall of Fame. He received 10 awards from the Academy of Country Music. He sold more than 47 million records in the United States alone, according to the Recording Industry Association of America.

A true rags-to-riches story, Rogers was raised in public housing in Houston Heights with seven siblings. As a 20-year-old, he had a gold single called “That Crazy Feeling,” under the name Kenneth Rogers, but when that early success stalled, he joined a jazz group, the Bobby Doyle Trio, as a standup bass player.

This Feb. 20, 1978 file photo shows Kenny Rogers at his home in Brentwood, Calif.   Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020.  (AP Photo/Wally Fong, File)© Provided by Associated Press FILE - This Feb. 20, 1978 file photo shows Kenny Rogers at his home in Brentwood, Calif. Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/Wally Fong, File)
But his breakthrough came when he was asked to join the New Christy Minstrels, a folk group, in 1966. The band reformed as First Edition and scored a pop hit with the psychedelic song, “Just Dropped In (To See What Condition My Condition Was In).” Rogers and First Edition mixed country-rock and folk on songs like “Ruby, Don’t Take Your Love To Town,” a story of a Vietnam veteran begging his girlfriend to stay. 

After the group broke up in 1974, Rogers started his solo career and found a big hit with the sad country ballad “Lucille,” in 1977, which crossed over to the pop charts and earned Rogers his first Grammy. Suddenly the star, Rogers added hit after hit for more than a decade.

“The Gambler,” the Grammy-winning story song penned by Don Schlitz, came out in 1978 and became his signature song with a signature refrain: “You gotta know when to hold ‘em, know when to fold ’em.” The song spawned a hit TV movie of the same name and several more sequels featuring Rogers as professional gambler Brady Hawkes, and led to a lengthy side career for Rogers as a TV actor and host of several TV specials.

FILE - In this June 9, 2012, file photo, Kenny Rogers performs at the 2012 CMA Music Festival in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81. (Photo by Wade Payne/Invision/AP, File)© Provided by Associated Press FILE - In this June 9, 2012, file photo, Kenny Rogers performs at the 2012 CMA Music Festival in Nashville, Tenn. Actor-singer Kenny Rogers, the smooth, Grammy-winning balladeer who spanned jazz, folk, country and pop with such hits as “Lucille,” “Lady” and “Islands in the Stream” and embraced his persona as “The Gambler” on record and on TV died Friday night, March 20, 2020. He was 81. (Photo by Wade Payne/Invision/AP, File)
“I think the best that any songwriter could hope for is to have Kenny Rogers sing one of your songs,” said Schlitz, who also co-wrote the other Parton-Rogers duet “You Can’t Make Old Friends.” “He gave so many career songs to so many of us.”

Schlitz noted that some of Rogers’ biggest hits were songs that had been recorded previously, but his versions became the most popular. “The Gambler” had been recorded six other times before Rogers and “Ruby Don’t Take Your Love to Town,” by Mel Tillis, was also recorded by other artists before Rogers.

Other hits included “You Decorated My Life,” “Every Time Two Fools Collide” with Dottie West, “Don’t Fall In Love with a Dreamer” with Kim Carnes, and “Coward of the County.” One of his biggest successes was “Lady,” written by Lionel Richie, a chart topper for six weeks straight in 1980. Richie said in a 2017 interview with the AP that he often didn’t finish songs until he had already pitched them, which was the case for “Lady.”

FILE - In this March 22, 1979 file photo,   Kenny Rogers, center, rolls the dice at Regine's in New York.   Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020.  (AP Photo/Richard Drew, File)© Provided by Associated Press FILE - In this March 22, 1979 file photo, Kenny Rogers, center, rolls the dice at Regine's in New York. Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/Richard Drew, File)
“In the beginning, the song was called, ‘Baby,'” Richie said. “And because when I first sat with him, for the first 30 minutes, all he talked about was he just got married to a real lady. A country guy like him is married to a lady. So, he said, ‘By the way, what’s the name of the song?’” Richie replies: “Lady.”

FILE - In this Sept. 27, 1983 file photo, Country Music singers Dolly Parton and Kenny Rogers rehearse a song for their appearance on the TV show "Live... And in Person" in Los Angeles.  Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020.  (AP Photo/Doug Pizac, File)© Provided by Associated Press FILE - In this Sept. 27, 1983 file photo, Country Music singers Dolly Parton and Kenny Rogers rehearse a song for their appearance on the TV show "Live... And in Person" in Los Angeles. Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/Doug Pizac, File)
Over the years, Rogers worked often with female duet partners, most memorably, Dolly Parton. The two were paired at the suggestion of the Bee Gees’ Barry Gibb, who wrote “Islands in the Stream.”

“Barry was producing an album on me and he gave me this song,” Rogers told the AP in 2017. “And I went and learned it and went into the studio and sang it for four days. And I finally looked at him and said, ‘Barry, I don’t even like this song anymore.’ And he said, ‘You know what we need? We need Dolly Parton.’ I thought, ‘Man, that guy is a visionary.’”

In this Feb. 28, 1980 file photo, Kenny Rogers holds a Grammy Award he received during presentation in Los Angles.  Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/McLendon, File)© Provided by Associated Press FILE- In this Feb. 28, 1980 file photo, Kenny Rogers holds a Grammy Award he received during presentation in Los Angles. Rogers, who embodied “The Gambler” persona and whose musical career spanned jazz, folk, country and pop, has died at 81. A representative says Rogers died at home in Georgia on Friday, March 20, 2020. (AP Photo/McLendon, File)
Coincidentally, Parton was actually in the same recording studio in Los Angeles when the idea came up.

“From the moment she marched into that room, that song never sounded the same,” Rogers said. “It took on a whole new spirit.”

The two singers toured together, including in Australia and New Zealand in 1984 and 1987, and were featured in a HBO concert special. Over the years the two would continue to record together, including their last duet, “You Can’t Make Old Friends,” which was released in 2013. Parton reprised “Islands in the Stream” with Rogers during his all-star retirement concert held in Nashville in October 2017.

Rogers invested his time and money in a lot of other endeavors over his career, including a passion for photography that led to several books, as well as an autobiography, “Making It With Music.” He had a chain of restaurants called Kenny Rogers Roasters and was a partner behind a riverboat in Branson, Missouri. He was also involved in numerous charitable causes, among them the Red Cross and MusiCares, and was part of the all-star “We are the World” recording for famine relief.



By the '90s, his ability to chart hits had waned, although he still remained a popular live entertainer with regular touring. Still he was an inventive businessman and never stopped trying to find his way back onto the charts.
At the age of 61, Rogers had a brief comeback on the country charts in 2000 with a hit song “Buy Me A Rose,” thanks to his other favorite medium, television. Producers of the series “Touched By An Angel” wanted him to appear in an episode, and one of his managers suggested the episode be based on his latest single. That cross-promotional event earned him his first No. 1 country song in 13 years.

Rogers is survived by his wife, Wanda, and his sons Justin, Jordan, Chris and Kenny Jr., as well as two brothers, a sister and grandchildren, nieces and nephews, his representative said. The family is planning a private service “out of concern for the national COVID-19 emergency,” a statement posted early Saturday read. A public memorial will be held at a later date.
"""

In [5]:
# Show the best-matching category found at each level, top level first.
for score, node in tree.search([article_text], cosine_similarity):
    print(node, "(similarity {})".format(score))

Performing arts (similarity 0.08373589656786475)
Musical theatre (similarity 0.11155650158050945)
Songs from musicals (similarity 0.14089107521579694)
