In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import PorterStemmer

In [2]:
# 1 Text Data Parsing and Vocabulary Selection(15 points)

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\\', ' ')
    text = re.sub(f'[{string.punctuation}]', '', text)
    text = re.sub(r'\d+', '', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

def create_vocabulary(text, top_n=200):
    stemmer = PorterStemmer()
    
    vocabulary = []
    cnt = defaultdict(int)
    for t in text:
        wds = t.split()
        for wd in wds:
            w = stemmer.stem(wd)
            cnt[w] += 1
    for k in cnt:
        vocabulary.append((k, cnt[k]))
    vocabulary.sort(reverse=True, key=lambda x: x[1])
    vocabulary = vocabulary[:top_n]
    return [v[0] for v in vocabulary]

In [4]:
data_path = './CharCnn_Keras/data/ag_news_csv/train.csv'
test_data_path = './CharCnn_Keras/data/ag_news_csv/test.csv'

df = pd.read_csv(data_path, names=['class', 'title', 'description']).sample(12000)
df['description'] = df['description'].apply(preprocess_text)

test_df = pd.read_csv(test_data_path, names=['class', 'title', 'description'])
test_df['description'] = test_df['description'].apply(preprocess_text)

vocabulary = create_vocabulary(df['description'])
print(vocabulary)

['said', 'new', 'us', 'reuter', 'year', 'compani', 'ap', 'first', 'two', 'monday', 'wednesday', 'world', 'tuesday', 'thursday', 'report', 'inc', 'one', 'friday', 'york', 'yesterday', 'state', 'game', 'week', 'quot', 'million', 'last', 'presid', 'say', 'nation', 'unit', 'time', 'corp', 'offici', 'govern', 'plan', 'peopl', 'day', 'would', 'secur', 'today', 'price', 'sunday', 'could', 'oil', 'group', 'servic', 'announc', 'team', 'three', 'kill', 'win', 'market', 'open', 'season', 'saturday', 'month', 'iraq', 'softwar', 'second', 'end', 'night', 'percent', 'make', 'lead', 'next', 'back', 'busi', 'comput', 'microsoft', 'minist', 'countri', 'intern', 'may', 'system', 'use', 'internet', 'billion', 'former', 'record', 'expect', 'american', 'citi', 'share', 'technolog', 'network', 'washington', 'research', 'relea', 'take', 'stock', 'help', 'leader', 'oper', 'start', 'final', 'forc', 'european', 'feder', 'home', 'victori', 'run', 'top', 'set', 'sale', 'player', 'san', 'giant', 'manag', 'maker', 

In [5]:
# 2 Document Relevance with Vector Space Basic Model (25 points)

In [6]:
def preprocess_queries(queries):
    stemmer = PorterStemmer()
    for i, query in enumerate(queries):
        wds = query.split()
        queries[i] = ' '.join([stemmer.stem(wd) for wd in wds])
        
    doc_freq = {}
    test_doc_freq = {}
    for query in queries:
        wds = query.split()
        for w in wds:
            doc_freq[w] = sum([1 for d in df['description'] if w in d.split()])
            test_doc_freq[w] = sum([1 for d in test_df['description'] if w in d.split()])
    return doc_freq, test_doc_freq

queries = ["olympic gold athens", "reuters stocks friday", "investment market prices"]
doc_freq, test_doc_freq = preprocess_queries(queries)

In [7]:
def compute_relevance_VSB(query, doc, normalization=False):
    wds = doc.split()
    score = 0
    for wd in query.split():
        if wd in wds:
            score += 1
    if normalization:
        doc_length = len(doc.split())
        if doc_length > 0:
            score /= doc_length
        else:
            score = 0
    return score

In [8]:
# 2.1 train set
for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_VSB(query, doc)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 66, Score: 3, Description: ireland look certain strip gold medal athen olymp nation popular hors waterford crystal test posit ban drug
Index: 687, Score: 3, Description: athen tom pappa us gold medal favorit decathlon saw quest olymp crown dash yesterday seriou foot injuri forc withdraw pole vault portion event competit
Index: 748, Score: 3, Description: irish showjump cian connor face lose olymp gold medal btest blood sampl confirm trace ban substanc hors rode victori athen
Index: 829, Score: 3, Description: athen bigger moment mari lou retton quotnofault vault quot produc america first olymp allaround gold medal women
Index: 1503, Score: 3, Description: total gold medal award wednesday athen olymp day five medal alreadi award includ archeri boat shoot event well judo cycl weightlift
Index: 1972, Score: 3, Description: ethiopia meseret defar women meter gold medal olymp game athen august yearold women winner year world indoor championship budapes

In [9]:
# 2.2 test set
for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_VSB(query, doc)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 138, Score: 3, Description: sheik ahm bin hashr almaktoum earn firstev olymp medal unit arab emir took home gold medal men doubl trap shoot tuesday athen
Index: 158, Score: 3, Description: athen reuter weari michael phelp target fourth olymp gold medal athen turn attent wednesday meter individu medley settl secondfastest overal time heat
Index: 177, Score: 3, Description: athen dutch cycl great leontien zijlaardvan moorsel emot defend olymp time trial gold medal
Index: 179, Score: 3, Description: amsterdam cyclist leontien zijlaardvan moorsel first gold medal netherland athen olymp game wednesday
Index: 197, Score: 3, Description: leontien zijlaardvan moorsel netherland wipe tear win gold medal women road cycl individu time trial vouliagmeni olymp centr athen wednesday
Bottom 5:
Index: 7595, Score: 0, Description: ukrainian presidenti candid viktor yushchenko poison harm known dioxin contain agent orang scientist analyz blood said friday
Index: 759

In [10]:
# 3 Document Relevance with Vector Space TF-IDF Model (40 points)

In [11]:
def compute_relevance_TF_IDF(query, doc, docs, N, avgdl, doc_freq, k=1.2, b=0.75, normalization=False):
    wds = doc.split()
    
    score = 0
    for w in query.split():
        freq = wds.count(w)
        tf = (freq * (k + 1)) / (freq + k * (1 - b + b * len(wds) / avgdl))
        f = doc_freq[w]
        idf = np.log((N + 1) / (f + 1))
        score += tf * idf

    if normalization:
        doc_length = len(doc.split())
        if doc_length > 0:
            score /= doc_length
        else:
            score = 0
    return score

In [12]:
# 3.1 train set
N = len(df['description'])
avgdl = sum([len(d.split()) for d in df['description']]) / N

for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_TF_IDF(query, doc, df['description'], N, avgdl, doc_freq)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 2667, Score: 16.324211881317666, Description: athen greec olymp supremo jacqu rogg today offer gold medal round athen organis superb olymp year doom gloom lead open
Index: 10408, Score: 14.791925866155143, Description: athen aug xinhuanet american carli patterson runnerup world took women individu allaround gymnast gold medal point athen olymp friday even
Index: 2807, Score: 14.274166801251479, Description: part bsampl show jump hors ireland gold medal athen olymp stolen intern equestrian feder said yesterday
Index: 5501, Score: 14.061825139435866, Description: athen natasa janic hungari captur women k kayak gold medal team katalin kovac win k deni birgit fischer recordequal ninth career olymp gold
Index: 66, Score: 13.957465971163131, Description: ireland look certain strip gold medal athen olymp nation popular hors waterford crystal test posit ban drug
Index: 829, Score: 13.957465971163131, Description: athen bigger moment mari lou retton quotno

In [13]:
# 3.2 test set
N = len(test_df['description'])
avgdl = sum([len(d.split()) for d in test_df['description']]) / N

for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_TF_IDF(query, doc, test_df['description'], N, avgdl, test_doc_freq)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 260, Score: 15.781793527929878, Description: athen reuter aaron peirsol second gold medal athen olymp thursday win appeal disqualif men meter backstrok
Index: 334, Score: 15.682551118041918, Description: tang gonghong lift world record claim athen th olymp gold china sinc particip olymp game saturday
Index: 179, Score: 15.196573608999902, Description: amsterdam cyclist leontien zijlaardvan moorsel first gold medal netherland athen olymp game wednesday
Index: 177, Score: 14.834774860079579, Description: athen dutch cycl great leontien zijlaardvan moorsel emot defend olymp time trial gold medal
Index: 2393, Score: 14.834774860079579, Description: american cyclist tyler hamilton keep gold medal athen olymp test lab mishandl blood sampl
Bottom 5:
Index: 7595, Score: 0.0, Description: ukrainian presidenti candid viktor yushchenko poison harm known dioxin contain agent orang scientist analyz blood said friday
Index: 7596, Score: 0.0, Description: suppli 

In [14]:
# 4 Document Relevance with Word2Vec (20 points)

In [15]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

tokenized_descriptions = [nltk.word_tokenize(text) for text in df['description']]
model = Word2Vec(sentences=tokenized_descriptions, vector_size=100, window=5, min_count=1, workers=4)

In [16]:
from string import punctuation as punct
from scipy.spatial import distance

def cos_similarity(word, doc, model):
    if word not in model.wv:
        return 0
    target_vec = model.wv[word]
    scores = []

    for wd in doc.split():
        if wd in model.wv:
            wd_vec = model.wv[wd]
            scores.append(1 - distance.cosine(target_vec, wd_vec))
        else:
            scores.append(0)

    return np.mean(scores)

def compute_relevance_Word2Vec(query, doc, model, normalization=False):
    score = 0
    for wd in query.split():
        score += cos_similarity(wd, doc, model)
    if normalization:
        doc_length = len(doc.split())
        if doc_length > 0:
            score /= doc_length
        else:
            score = 0
    return score

In [17]:
# 4.1 train set
for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_Word2Vec(query, doc, model)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 9849, Score: 2.9578118920326233, Description: cardin blast way anoth win dodger head dodger stadium lead nl playoff seri
Index: 8564, Score: 2.9571001099215612, Description: green bay wi brett favr threw three touchdown pass th consecut start lead green bay packer win st
Index: 10751, Score: 2.956276659454618, Description: matt leinart undef usc defeat ucla saturday play orang bowl bowl championship seri titl
Index: 11534, Score: 2.9554579754670462, Description: quarterback david green throw yard touchdown pass georgia victori georgia tech saturday
Index: 2937, Score: 2.9485160946846007, Description: everyth go astro way day score ton run schedul look favor bullpen start kick even start pitch consist roger clemen
Index: 10394, Score: 2.94712816293423, Description: astro charg playoff th consecut home victori cap incred turnaround beat colorado sunday
Index: 7570, Score: 2.9470824813842773, Description: st loui cardin houston astro go distanc decid

In [18]:
# 4.2 test set
for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_Word2Vec(query, doc, model)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 5735, Score: 2.9673777265208106, Description: indianapoli peyton man complet pass yard threw five touchdown indianapoli colt beat houston texan
Index: 3031, Score: 2.949510125319163, Description: anaheim score three run eighth inning oakland reliev ralli victori clinch american leagu west titl
Index: 4215, Score: 2.9477919467857907, Description: ap biggest comeback postseason basebal histori began david ortiz one greatest day basebal histori
Index: 2803, Score: 2.9451296712671007, Description: steve william pick ball barri bond hit th home run thought hand piec histori
Index: 2389, Score: 2.9431718358626733, Description: watch miami might among colleg footbal elit one think orang bowl quit yet
Bottom 5:
Index: 7513, Score: 1.482425191823174, Description: scientist india work himalaya discov new speci monkey stocki shorttail brownhair creatur name macaca munzala arunach macaqu
Index: 805, Score: 1.469467681646347, Description: joestar write quotmand

In [19]:
# 5 Document Relevance with Vector Space (20 points)

In [20]:
# 5.1 Document Relevance with Vector Space Basic Model with Document Length Normalization

In [21]:
# 5.1.1 train set
for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_VSB(query, doc, normalization=True)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 4753, Score: 0.2222222222222222, Description: athen unfortun unit state olymp box tournament gone expect
Index: 5786, Score: 0.2, Description: nesterenko belarusian never broken second olymp four round gold second
Index: 2807, Score: 0.1875, Description: part bsampl show jump hors ireland gold medal athen olymp stolen intern equestrian feder said yesterday
Index: 66, Score: 0.17647058823529413, Description: ireland look certain strip gold medal athen olymp nation popular hors waterford crystal test posit ban drug
Index: 829, Score: 0.17647058823529413, Description: athen bigger moment mari lou retton quotnofault vault quot produc america first olymp allaround gold medal women
Index: 7413, Score: 0.17647058823529413, Description: fourtim olymp gold medallist matthew pinsent morn announc retir row three month final triumph athen coxless four
Index: 5115, Score: 0.15789473684210525, Description: yler hamilton bobbi julich show depth american cycl win

In [22]:
# 5.1.2 test set
for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_VSB(query, doc)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 138, Score: 3, Description: sheik ahm bin hashr almaktoum earn firstev olymp medal unit arab emir took home gold medal men doubl trap shoot tuesday athen
Index: 158, Score: 3, Description: athen reuter weari michael phelp target fourth olymp gold medal athen turn attent wednesday meter individu medley settl secondfastest overal time heat
Index: 177, Score: 3, Description: athen dutch cycl great leontien zijlaardvan moorsel emot defend olymp time trial gold medal
Index: 179, Score: 3, Description: amsterdam cyclist leontien zijlaardvan moorsel first gold medal netherland athen olymp game wednesday
Index: 197, Score: 3, Description: leontien zijlaardvan moorsel netherland wipe tear win gold medal women road cycl individu time trial vouliagmeni olymp centr athen wednesday
Bottom 5:
Index: 7595, Score: 0, Description: ukrainian presidenti candid viktor yushchenko poison harm known dioxin contain agent orang scientist analyz blood said friday
Index: 759

In [23]:
# 5.2 Document Relevance with Vector Space TF-IDF Model with Document Length Normalization

In [24]:
# 5.2.1 train set
N = len(df['description'])
avgdl = sum([len(d.split()) for d in df['description']]) / N

for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_TF_IDF(query, doc, df['description'], N, avgdl, doc_freq, normalization=True)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 4753, Score: 1.2273888749774444, Description: athen unfortun unit state olymp box tournament gone expect
Index: 5786, Score: 1.0857851139636747, Description: nesterenko belarusian never broken second olymp four round gold second
Index: 2807, Score: 0.8921354250782174, Description: part bsampl show jump hors ireland gold medal athen olymp stolen intern equestrian feder said yesterday
Index: 66, Score: 0.8210274100684195, Description: ireland look certain strip gold medal athen olymp nation popular hors waterford crystal test posit ban drug
Index: 829, Score: 0.8210274100684195, Description: athen bigger moment mari lou retton quotnofault vault quot produc america first olymp allaround gold medal women
Index: 7413, Score: 0.8210274100684195, Description: fourtim olymp gold medallist matthew pinsent morn announc retir row three month final triumph athen coxless four
Index: 2667, Score: 0.8162105940658833, Description: athen greec olymp supremo jacqu 

In [25]:
# 5.2.2 test set
N = len(test_df['description'])
avgdl = sum([len(d.split()) for d in test_df['description']]) / N

for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_TF_IDF(query, doc, test_df['description'], N, avgdl, test_doc_freq, normalization=True)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 81, Score: 1.401265537708519, Description: athen reuter beach volleybal olymp sellout footstomp success
Index: 179, Score: 1.1689672006923002, Description: amsterdam cyclist leontien zijlaardvan moorsel first gold medal netherland athen olymp game wednesday
Index: 126, Score: 1.062016179330343, Description: canadian husband love wife led tighten secur olymp venu athen
Index: 177, Score: 1.05962677571997, Description: athen dutch cycl great leontien zijlaardvan moorsel emot defend olymp time trial gold medal
Index: 2393, Score: 1.05962677571997, Description: american cyclist tyler hamilton keep gold medal athen olymp test lab mishandl blood sampl
Bottom 5:
Index: 7595, Score: 0.0, Description: ukrainian presidenti candid viktor yushchenko poison harm known dioxin contain agent orang scientist analyz blood said friday
Index: 7596, Score: 0.0, Description: suppli attract pitch option dwindl daili lost pedro martinez met miss tim hudson resign randi jo

In [26]:
# 5.3 Document Relevance with Word2Vec with Document Length Normalization

In [27]:
# 5.3.1 train set
for query in queries:
    scores = []
    for i, doc in enumerate(df['description']):
        scores.append((i, compute_relevance_Word2Vec(query, doc, model, normalization=True)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 10:")
    for i, score in scores[:10]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print("Bottom 10:")
    for i, score in scores[-10:]:
        print(f"Index: {i}, Score: {score}, Description: {df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 10:
Index: 6828, Score: 0.8900306291050382, Description: ltstronggtopinionltstronggt seed disast
Index: 11562, Score: 0.8836497068405151, Description: reuter unpreced departur
Index: 367, Score: 0.7181598879396915, Description: ltstronggtcashncarrionltstronggt get em sunday
Index: 1846, Score: 0.7039507292211056, Description: ltstronggtanalysisltstronggt two year question
Index: 2023, Score: 0.644419863820076, Description: associ press ted bridi
Index: 7211, Score: 0.6368850506842136, Description: modest propos orlowski realli
Index: 3946, Score: 0.6360215619206429, Description: ltstronggtstorag decisionsltstronggt lord code
Index: 9218, Score: 0.5819442105293273, Description: relationship work leav anoth browser
Index: 2364, Score: 0.5758741354942322, Description: lawsuit doom stolen patent predict
Index: 6781, Score: 0.5731010874733329, Description: galileo threaten american live
Bottom 10:
Index: 4363, Score: 0.041626535072458656, Description: sinc launch

In [28]:
# 5.3.2 test set
for query in queries:
    scores = []
    for i, doc in enumerate(test_df['description']):
        scores.append((i, compute_relevance_Word2Vec(query, doc, model, normalization=True)))
    scores.sort(reverse=True, key=lambda x: x[1])
    print(f"Query: {query}")
    print("Top 5:")
    for i, score in scores[:5]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print("Bottom 5:")
    for i, score in scores[-5:]:
        print(f"Index: {i}, Score: {score}, Description: {test_df['description'].iloc[i]}")
    print('\n')

Query: olymp gold athen
Top 5:
Index: 592, Score: 0.7174954488873482, Description: associ press curt anderson
Index: 3985, Score: 0.7049224749207497, Description: sign delay manag expect
Index: 4174, Score: 0.6682869717478752, Description: reuter revolution physic made
Index: 1104, Score: 0.6632675044238567, Description: ltstronggtopinionltstronggt impot ipod pride
Index: 1797, Score: 0.5335991196334362, Description: associ press robert wielaard
Bottom 5:
Index: 5129, Score: 0.03428537845405185, Description: palmoneupgrad treo faster chip better display unit ship treo one big smartphon success stori last week palmon introduc followon treo higher resolut bypixel tft screen compani claim increas visibl area display make pictur document much clearer also carri remov batteri mb flash memori faster mhz intel xscale processor improv multimedia featur includ builtin mp player digit camera improv lowlight capabl well video captur playback function product expect ship year end carrier add servi