In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Load the news corpus; the 'idNoticia' column becomes the row index,
# which the rest of the notebook uses as the document id.
news = pd.read_csv('noticias_estadao.csv', index_col='idNoticia')

In [4]:
# Order the rows by document id (ascending is the default sort order).
news = news.sort_index()

# Transforma o texto de cada célula da tabela em uma lista das palavras distintas do texto, em letras minúsculas.

In [5]:
def _unique_lowercase_words(text):
    """Return the distinct space-separated tokens of ``text``, lower-cased.

    The result is a list so it can be stored in a DataFrame cell; its order is
    arbitrary because duplicates are removed through a ``set``.
    """
    return list(set(text.lower().split(" ")))


# Tokenize every cell, one column at a time.  The previous loop relied on the
# deprecated ``.ix`` indexer (no longer available in current pandas) and on the
# row labels being exactly 1..len(news); mapping each column works for any
# index.
for column_name in news.columns:
    news[column_name] = news[column_name].map(_unique_lowercase_words)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


# Cria um índice invertido das palavras dos documentos, associando cada palavra aos documentos em que ela se encontra.

In [6]:
# Inverted index: word -> set of document ids (row labels of ``news``) whose
# cells contain that word.
reference_list = {}

# ``iterrows`` yields (row label, row) pairs, so no assumption is made about
# the labels being contiguous and the deprecated ``.ix`` indexer is not needed.
for doc_id, row in news.iterrows():
    for words in row:  # one list of words per column of the document
        for word in words:
            # setdefault creates the posting set the first time a word is seen.
            reference_list.setdefault(word, set()).add(doc_id)

# FUNÇÕES DE BUSCA

In [7]:
def search_one_term(word):
    """Return the ids of the documents that contain ``word``.

    Parameters
    ----------
    word : str
        Term looked up verbatim in the global ``reference_list`` index.

    Returns
    -------
    list
        Document ids in ascending order, consistent with the OR/AND searches.
        An empty list is returned when the term is not in the index instead of
        raising ``KeyError``.
    """
    return sorted(reference_list.get(word, ()))

In [8]:
def search_or_two_terms(word1, word2):
    """Return the ids of the documents that contain ``word1`` or ``word2``.

    Parameters
    ----------
    word1, word2 : str
        Terms looked up verbatim in the global ``reference_list`` index.

    Returns
    -------
    list
        Ascending, duplicate-free document ids.  A term that is not indexed
        contributes no documents instead of raising ``KeyError``.
    """
    docs1 = reference_list.get(word1, set())
    docs2 = reference_list.get(word2, set())
    # The postings are already sets, so a native union replaces the numpy
    # round-trip; sorting keeps the ascending order callers were getting.
    return sorted(docs1 | docs2)

In [9]:
def smaller_terms(words):
    """Move the term that occurs in the fewest documents to ``words[0]``.

    The list is reordered in place by swapping the rarest term with the first
    element; when several terms tie, the earliest one wins.  Document counts
    are read from the global ``reference_list``.

    Parameters
    ----------
    words : list of str
        Query terms; modified in place.

    Returns
    -------
    None
        The function works purely through its side effect on ``words``.
    """
    rarest = 0
    if len(words) > 1:
        # min() keeps the first of several equally small entries, which
        # matches a scan that only replaces on a strictly smaller count.
        rarest = min(
            range(len(words)),
            key=lambda position: len(reference_list[words[position]]),
        )

    # Swap the rarest term into the leading slot.
    words[0], words[rarest] = words[rarest], words[0]

In [10]:
def search_and_n_terms(words):
    """Return the ids of the documents that contain every term in ``words``.

    Parameters
    ----------
    words : list of str
        Terms looked up verbatim in the global ``reference_list`` index.  The
        list is not modified.

    Returns
    -------
    list
        Ascending document ids.  The result is empty when ``words`` is empty
        or when any term is not in the index (no document can contain it).
    """
    postings = []
    for word in words:
        docs = reference_list.get(word)
        if not docs:
            # An unknown term makes the whole conjunction empty.
            return []
        postings.append(docs)

    if not postings:
        return []

    # Intersect starting from the smallest posting set so the intermediate
    # result is as small as possible from the first step.
    postings.sort(key=len)
    common = set(postings[0]).intersection(*postings[1:])
    return sorted(common)

# função que trata e identifica a entrada, para selecionar a função de busca

In [11]:
def search(terms):
    """Parse a query string and dispatch it to the matching search function.

    Supported forms (matching is case-insensitive because the query is
    lower-cased first):

    * ``"word"``          -> documents containing the word;
    * ``"w1 OR w2"``      -> documents containing either word;
    * ``"w1 AND w2"``     -> documents containing both words;
    * anything else       -> documents containing all of the words.

    In the operator forms only the first and third tokens are used; any
    further tokens are ignored.

    Parameters
    ----------
    terms : str
        The query text.

    Returns
    -------
    list
        Whatever the selected search function returns (document ids); an
        empty list for a blank query.
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so no empty-string tokens reach the index lookup.
    words = terms.lower().split()

    if not words:
        return []

    if len(words) == 1:
        return search_one_term(words[0])

    # An operator needs a right-hand operand; without this length check a
    # query such as "debate or" would fail on words[2].
    if len(words) >= 3:
        operator = words[1]
        if operator == 'or':
            return search_or_two_terms(words[0], words[2])
        if operator == 'and':
            return search_and_n_terms([words[0], words[2]])

    return search_and_n_terms(words)

# TESTES

In [12]:
# Show the ids of the documents that contain "debate" or "presidencial".
search("debate OR presidencial")

[1,
 2,
 16,
 24,
 46,
 50,
 61,
 79,
 86,
 88,
 105,
 107,
 109,
 122,
 130,
 140,
 143,
 158,
 165,
 166,
 182,
 189,
 199,
 201,
 203,
 204,
 205,
 209,
 213,
 228,
 234,
 235,
 238,
 240,
 241,
 244,
 250,
 255,
 259,
 260,
 261,
 273,
 278,
 295,
 311,
 327,
 334,
 336,
 343,
 345,
 348,
 353,
 355,
 357,
 363,
 368,
 369,
 374,
 378,
 383,
 389,
 392,
 396,
 399,
 400,
 413,
 416,
 417,
 419,
 452,
 453,
 461,
 463,
 471,
 472,
 475,
 478,
 484,
 487,
 491,
 542,
 543,
 547,
 548,
 549,
 553,
 560,
 594,
 613,
 614,
 624,
 632,
 634,
 637,
 639,
 640,
 641,
 649,
 653,
 661,
 668,
 670,
 673,
 679,
 683,
 684,
 685,
 697,
 699,
 701,
 703,
 704,
 713,
 714,
 717,
 718,
 722,
 723,
 740,
 745,
 750,
 752,
 753,
 756,
 760,
 761,
 763,
 764,
 768,
 776,
 777,
 779,
 789,
 793,
 803,
 805,
 806,
 807,
 808,
 809,
 811,
 812,
 814,
 817,
 823,
 826,
 827,
 828,
 840,
 842,
 843,
 848,
 857,
 858,
 859,
 862,
 867,
 869,
 872,
 873,
 875,
 877,
 882,
 883,
 890,
 892,
 893,
 896,
 897

In [13]:
# The OR query is expected to match 1770 documents.
assert len(search("debate OR presidencial")) == 1770

In [14]:
# Show the ids of the documents that contain both "debate" and "presidencial".
search("debate AND presidencial")

[24,
 166,
 234,
 255,
 327,
 637,
 703,
 704,
 713,
 722,
 776,
 779,
 805,
 814,
 883,
 982,
 1005,
 1013,
 1018,
 1038,
 1043,
 1058,
 1069,
 1102,
 1111,
 1131,
 1132,
 1138,
 1140,
 1151,
 1155,
 1158,
 1180,
 1197,
 1291,
 1313,
 1315,
 1320,
 1325,
 1326,
 1339,
 1348,
 1362,
 1367,
 1374,
 1379,
 1391,
 1394,
 1399,
 1404,
 1406,
 1433,
 1446,
 1460,
 1465,
 1489,
 1507,
 1508,
 1523,
 1571,
 1586,
 1588,
 1600,
 1647,
 1658,
 1672,
 1767,
 1811,
 1816,
 1844,
 1845,
 1855,
 1864,
 1873,
 1939,
 2026,
 2033,
 2054,
 2068,
 2069,
 2092,
 2107,
 2108,
 2176,
 2184,
 2211,
 2224,
 2255,
 2266,
 2313,
 2338,
 2388,
 2449,
 2469,
 2489,
 2521,
 2548,
 2574,
 2608,
 2669,
 2705,
 2711,
 2774,
 2853,
 2880,
 2911,
 3127,
 3206,
 3219,
 3235,
 3236,
 3242,
 3251,
 3252,
 3268,
 3322,
 3367,
 3450,
 3454,
 3469,
 3534,
 3592,
 3611,
 3689,
 3792,
 3793,
 3805,
 3806,
 3807,
 3811,
 3830,
 3844,
 3860,
 3874,
 3887,
 3916,
 4134,
 4261,
 4318,
 4442,
 4460,
 4504,
 4505,
 4615,
 4619,
 4

In [15]:
# The AND query is expected to match 201 documents.
assert len(search("debate AND presidencial")) == 201

In [16]:
# Show the ids of the documents that contain "presidenciáveis" or "corruptos".
search("presidenciáveis OR corruptos")

[68,
 93,
 126,
 149,
 160,
 176,
 180,
 272,
 304,
 330,
 375,
 422,
 426,
 430,
 456,
 497,
 523,
 538,
 539,
 777,
 789,
 841,
 874,
 893,
 1109,
 1129,
 1158,
 1198,
 1325,
 1326,
 1343,
 1428,
 1461,
 1470,
 1481,
 1487,
 1537,
 1639,
 1693,
 1847,
 1851,
 1859,
 2023,
 2024,
 2026,
 2028,
 2030,
 2051,
 2068,
 2069,
 2080,
 2088,
 2093,
 2123,
 2125,
 2144,
 2149,
 2152,
 2225,
 2248,
 2253,
 2285,
 2442,
 2458,
 2471,
 2507,
 2516,
 2571,
 2574,
 2609,
 2628,
 2660,
 2669,
 2672,
 2676,
 2678,
 2686,
 2732,
 2740,
 2764,
 2813,
 2860,
 3042,
 3046,
 3182,
 3188,
 3217,
 3243,
 3248,
 3260,
 3377,
 3391,
 3397,
 3444,
 3446,
 3447,
 3466,
 3479,
 3489,
 3546,
 3607,
 3616,
 3633,
 3641,
 3664,
 3666,
 3684,
 3730,
 3747,
 3767,
 3786,
 3843,
 3860,
 3962,
 4025,
 4034,
 4042,
 4137,
 4170,
 4177,
 4184,
 4199,
 4219,
 4235,
 4258,
 4260,
 4294,
 4488,
 4492,
 4660,
 4677,
 4743,
 4847,
 4926,
 4965,
 5110,
 5115,
 5121,
 5133,
 5237,
 5271,
 5472,
 5587,
 5659,
 5713,
 5895,
 590

In [17]:
# The OR query is expected to match 164 documents.
assert len(search("presidenciáveis OR corruptos")) == 164

In [18]:
# Show the ids of the documents that contain both "presidenciáveis" and "corruptos".
search("presidenciáveis AND corruptos")

[]

In [19]:
# No document is expected to contain both terms.
assert len(search("presidenciáveis AND corruptos")) == 0

In [20]:
# Mixed-case query: search() lower-cases the terms before looking them up.
search("Belo OR Horizonte")

[4,
 13,
 48,
 74,
 75,
 196,
 206,
 211,
 304,
 332,
 374,
 624,
 634,
 643,
 661,
 999,
 1001,
 1013,
 1020,
 1033,
 1044,
 1046,
 1081,
 1086,
 1087,
 1088,
 1090,
 1094,
 1133,
 1142,
 1180,
 1183,
 1186,
 1230,
 1242,
 1264,
 1270,
 1280,
 1315,
 1318,
 1329,
 1331,
 1343,
 1362,
 1367,
 1381,
 1386,
 1388,
 1389,
 1405,
 1406,
 1424,
 1431,
 1434,
 1445,
 1506,
 1507,
 1508,
 1542,
 1560,
 1586,
 1598,
 1603,
 1605,
 1734,
 1762,
 1786,
 1797,
 1798,
 1823,
 1858,
 1861,
 1872,
 1877,
 1880,
 1883,
 1892,
 1909,
 1917,
 2002,
 2004,
 2011,
 2018,
 2031,
 2032,
 2040,
 2044,
 2061,
 2063,
 2064,
 2065,
 2074,
 2095,
 2098,
 2101,
 2127,
 2145,
 2164,
 2207,
 2213,
 2219,
 2233,
 2362,
 2363,
 2365,
 2381,
 2395,
 2483,
 2494,
 2576,
 2672,
 2673,
 2700,
 2710,
 2798,
 2805,
 2853,
 2873,
 2909,
 2944,
 2950,
 2951,
 3019,
 3029,
 3031,
 3069,
 3109,
 3110,
 3119,
 3129,
 3135,
 3138,
 3142,
 3143,
 3160,
 3170,
 3183,
 3186,
 3188,
 3214,
 3224,
 3225,
 3248,
 3262,
 3284,
 3288,


In [21]:
# The OR query is expected to match 331 documents.
assert len(search("Belo OR Horizonte")) == 331

In [22]:
# Show the ids of the documents that contain both "belo" and "horizonte".
search("Belo AND Horizonte")

[4,
 13,
 48,
 74,
 75,
 206,
 332,
 624,
 634,
 1013,
 1020,
 1033,
 1046,
 1081,
 1086,
 1087,
 1088,
 1090,
 1094,
 1133,
 1142,
 1180,
 1186,
 1230,
 1242,
 1264,
 1270,
 1280,
 1315,
 1329,
 1331,
 1343,
 1362,
 1367,
 1381,
 1386,
 1388,
 1389,
 1405,
 1406,
 1424,
 1431,
 1434,
 1506,
 1507,
 1508,
 1598,
 1603,
 1605,
 1734,
 1762,
 1797,
 1798,
 1823,
 1858,
 1861,
 1872,
 1877,
 1880,
 1883,
 1892,
 1909,
 1917,
 2002,
 2004,
 2011,
 2018,
 2031,
 2040,
 2044,
 2061,
 2063,
 2064,
 2065,
 2074,
 2095,
 2098,
 2101,
 2127,
 2145,
 2164,
 2213,
 2219,
 2381,
 2395,
 2483,
 2494,
 2672,
 2673,
 2798,
 2909,
 2944,
 2950,
 2951,
 3031,
 3069,
 3119,
 3129,
 3135,
 3138,
 3142,
 3143,
 3160,
 3183,
 3186,
 3188,
 3224,
 3262,
 3316,
 3326,
 3413,
 3445,
 3522,
 3528,
 3531,
 3534,
 3593,
 3604,
 3626,
 3698,
 3700,
 3852,
 3860,
 3915,
 3923,
 3926,
 3964,
 3990,
 3998,
 4115,
 4116,
 4177,
 4184,
 4230,
 4234,
 4241,
 4245,
 4251,
 4273,
 4294,
 4333,
 4357,
 4361,
 4375,
 4398,


In [23]:
# The AND query is expected to match 242 documents.
assert len(search("Belo AND Horizonte")) == 242

In [24]:
# Single-term query: show the ids of the documents that contain "candidatos".
search("candidatos")

[51,
 73,
 77,
 84,
 123,
 127,
 142,
 143,
 154,
 161,
 167,
 189,
 191,
 263,
 268,
 276,
 311,
 343,
 374,
 375,
 377,
 383,
 399,
 417,
 418,
 444,
 483,
 484,
 604,
 605,
 616,
 623,
 628,
 632,
 633,
 640,
 647,
 651,
 662,
 688,
 717,
 718,
 746,
 747,
 752,
 761,
 764,
 777,
 783,
 790,
 791,
 792,
 793,
 795,
 796,
 801,
 804,
 813,
 825,
 827,
 828,
 829,
 844,
 859,
 870,
 871,
 876,
 890,
 982,
 985,
 1005,
 1011,
 1014,
 1016,
 1019,
 1020,
 1034,
 1040,
 1050,
 1053,
 1058,
 1059,
 1068,
 1075,
 1077,
 1086,
 1091,
 1096,
 1097,
 1099,
 1103,
 1106,
 1108,
 1112,
 1113,
 1115,
 1116,
 1117,
 1119,
 1123,
 1124,
 1128,
 1131,
 1134,
 1140,
 1143,
 1148,
 1158,
 1162,
 1163,
 1165,
 1166,
 1168,
 1170,
 1172,
 1174,
 1175,
 1178,
 1186,
 1189,
 1191,
 1193,
 1194,
 1195,
 1197,
 1198,
 1199,
 1200,
 1201,
 1202,
 1203,
 1208,
 1209,
 1210,
 1213,
 1214,
 1218,
 1228,
 1229,
 1230,
 1234,
 1236,
 1237,
 1244,
 1248,
 1251,
 1257,
 1265,
 1267,
 1269,
 1274,
 1277,
 1278,
 12

In [25]:
# Number of documents that contain "candidatos".
len(search("candidatos"))

1395

In [26]:
# A query without an OR/AND operator as its second word is treated as an AND
# over all of its words.
search("PT não pode se queixar afirma futuro articulador")

[11]

In [27]:
# Number of documents that contain every word of the free-text query.
len(search("PT não pode se queixar afirma futuro articulador"))

1