## 參考資料:
## https://docs.mongodb.org/manual/reference/sql-comparison/
## no-sql的mongodb的指令與sql不同,
## 比如"db.person.find()"等於sql的"select * from person"
## (所以mongodb查詢age=20的語法長什麼樣子?)
## 請先看過參考資料的語法對照,
## 了解新增修改刪除查詢的語法之後才看得懂pymongo¶
## 注意key:value當中,key是英文大小寫分開的,
## name與NAME會被視作不同的key(欄位)

In [None]:
import pymongo
from pymongo import MongoClient

#uri = "mongodb://USERNAME:password@host?authSource=source" #需要帳號權限登入才用
#client = MongoClient(uri)

client = MongoClient() #預設連接自己主機上的mongodb
database = client["test"] # SQL: Database Name
collection = database["person"]   # SQL: Table Name

record = {"name":"python","age":20}
collection.insert_one(record) #新增一筆資料進去

In [None]:
for post in collection.find(): #collection = person,這等於select * from person
    print post

print "----------"
obj = collection.find()
print type(obj) #傳回的物件型態, 可當成list取用
print "----------"
print obj[1]
print "----------"
print obj[3]["name"],obj[3]["age"] #從list中取出第3本字典查詢
print "----------"
print obj["name"] #注意傳回物件不是字典,不能用這種方式查詢
#想查所有name不是這樣寫的!!

In [None]:
for post in collection.find(): #查詢所有name的正確寫法!!
    print post["name"], post["age"]

In [None]:
collection.insert_many(  #insert 3份字典
        [{"name":"dabe","age":40},
         {"name":"bobe","age":32},
         {"name":"san","age":20}]
)

for post in collection.find(): #collection = person,這等於select * from person
    print post

In [None]:
collection.save({"_id":'57037a853f3dfa22a841c688',"name":"python"})
#save()是sql的update, 也可以用replace_one(),指定資料可以用主鍵

for post in collection.find(): #collection = person,這等於select * from person
    print post

In [None]:
for post in collection.find({"age":20},{"_id":0,"name":1}):
    print post
#find()可填入2本字典, 第一本是查詢條件, 第二本用來決定顯示欄位(0/1)
#這裡是查詢age = 20的所有資料, _id欄位不顯示, name欄位顯示
#sql寫法是select name from person where age = 20

In [None]:
for post in collection.find(
    {"age":{"$gt":20}}, #查詢條件age > 20 用$gt表示大於
    {"_id":0,"name":1,"age":1}).sort("age",-1):
    #sort()填入2個參數(排序欄位,正/負向排序)
    print post

#$字串用法:
#equal to, 等於, 沒有相等的用法, 指定age=20就寫成{"age":20}
#great than, 大於$gt 比如{"age":{"$gt":20}}
#great than or equal to, 大於等於$gte
#less than, 小於$lt
#less than or equal to, 小於等於$lte
#not equal to, 不等於$ne
#---------------------------------
#比對字串用以下兩個
#注意這是"精確查詢" (Precise Search)
#in, 包含某字串$in
#not in, 不包含某字串$nin

In [None]:
client = MongoClient()
database = client["test"]
collection = database["person"]

for post in collection.find(
    {"age":{"$gt":20},
    "name":{"$nin":["bobe","dabe"]}}, #查詢條件age > 20且名字不是bobe
    {"_id":0,"name":1,"age":1}).sort("age",-1):
    print post

## 接著來試把一個test.json檔案塞進mongodb,
## 這個檔案內有12篇新聞的資料,每一篇都有這些key:title,content,date,url 也就是一篇新聞的資料就是一個document(在python上稱做dictionary), 這個json檔案是一份collection(在python上被當成一份list), 格式大略如下[{新聞1},{新聞2},{新聞3},...] 其中的{新聞} = {"title":value1,"content":value2,"date":value3,...} 因為是多份document,所以用insert_many()

In [None]:
import pymongo
from pymongo import MongoClient
import json

with open('E:/mondb/test.json', 'r') as f:
    i = f.read()
    #json1 = json.loads(i)['url'],json.loads(i)['title'],json.loads(i)['date'],json.loads(i)['content']
    json = json.loads(i)
    f.close()
    print len(json)
    
client = MongoClient() 
database = client["test"]
collection = database["news"]
collection.insert_many(
    json
)

## 模糊查詢(Fuzzy Search)
在cmd上對mongodb做模糊查詢可以用{key:/關建詞/}的寫法來完成, 比如db.collection.find({"content":/民眾/},{"_id":0}) 但是pymongo不支援這種方法, 所以我們需要正規表達式來實現模糊查詢

In [None]:
#for post in collection.find({"content":/民眾/},{"_id":0}): 
#別掙扎了, pymongo沒有這種寫法, 這是直接對著mongodb終端機視窗的時候才有用的
#乖乖學正規表達法吧!

count = 0
tax = "民眾"


for post in collection.find({"content":{"$regex":tax}},{"_id":0}): 
    #這才是pymongo做模糊查詢的正確寫法
    print "--分類--"
    print post["category"]
    print "--報社--"
    print post["comp"]
    print "--新聞日期--"
    print post["date"]
    print "--標題--"
    print post["title"]
    print "--內文--"
    print post["content"]
    print "--連結--"
    print post["url"]
    print "----------------"
    count +=1
print "----------------"
print "搜尋出"+str(count)+"篇包含 "+tax+" 的新聞"

## 多個條件查詢

In [None]:
from pymongo import MongoClient  #logical query selector

client = MongoClient('10.120.28.20', 27017)
database = client["test"]
collection = database["news"]

count = 0
tag1 = "王建民"
tag2 = "滾地球"
tag3 = "皇家"

#查詢滿足多重條件的語法是{"$and":[{條件1},{條件2},{條件3},...]}
#另有$or, $not, $nor
#$all在pymongo已廢止, 因為與$and功能重複

for post in collection.find(
    {"$and":[                   
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
            {"content":{"$regex":tag3}},
            {"date":{"$regex":"2016"}}
            ]},{"_id":0}): 
    
    print "--分類--"
    print post["category"]
    print "--報社--"
    print post["comp"]
    print "--新聞日期--"
    print post["date"]
    print "--標題--"
    print post["title"]
    print "--內文--"
    print post["content"]
    print "--連結--"
    print post["url"]
    print "----------------"
    count +=1
print "----------------"
print "搜尋出"+str(count)+"篇包含 "+tag1+", "+tag2+", "+tag3+","+" 於2016年的新聞"

client.close()

In [None]:
#計算查詢結果篇數可以用find().count(), 省去自己計算count的程序

tag1 = "王建民"
tag2 = "滾地球"
tag3 = "皇家"

print collection.find(
    {"$and":[                   
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
            {"content":{"$regex":tag3}},
            {"date":{"$regex":"2016"}}
            ]},{"_id":0}).count()

## 一次掃描資料夾所有檔案存入mongo資料庫

In [2]:
import pymongo                                    #倒資料
from pymongo import MongoClient
import json
import os

def loadJson(path):                               #定義從硬碟讀進json檔案的方法
    with open(path, 'r') as f:                    #開檔
        i = f.read()                              #讀到記憶體
        jf = json.loads(i)                        #套用json格式並存到變數jf
        f.close()                                 #關檔
        print "已讀取 "+str(len(jf))+" 筆新聞資料"  #計算json檔案內的資料筆數
        return jf, len(jf)                        #傳回放在jf變數的json檔案與資料筆數
    

def insertToMongo(json):                          #定義新增json檔案到mongodb的方法
    client = MongoClient('10.120.28.12', 27017)   #輸入mongodb主機IP與port,連自己主機可以空白,
    database = client["test"]                     #[]填入自己設定的db name
    collection = database["test2"]                 #[]填入自己設定的collection name
    collection.insert(                       #新增資料進去，可能會因為版本不同要改成insert()而不是insert_many()
        json
    )
    client.close()
    

#path = ""
syspath = "E:/project_data/111" 
count = 0
total = 0

for dirPath, dirNames, fileNames in os.walk(syspath):    #列出syspath下的所有檔案
    for f in fileNames:
        #print os.path.join(dirPath, f)                 
        path = os.path.join(dirPath, f)                  #組合出每一個檔案的完整路徑
        
        jfile, count = loadJson(path)                    #將路徑傳入loadJson方法, 取回檔案內容
        insertToMongo(jfile)                       #將檔案內容傳入insertToMongo方法,把檔案內容存到mongodb
        total += count
        
print "共存入 "+str(total)+"筆資料"                #算出總共有多少篇新聞資料

with open("E:/project_data/readme1.txt", 'a') as f:     #計算結果存到readme.txt
    s = syspath +"下共有 "+ str(total) +" 筆新聞資料"+"\n"
    f.write(s)
    f.close()

已讀取 7648 筆新聞資料
已讀取 6952 筆新聞資料
已讀取 7506 筆新聞資料
共存入 22106筆資料


## 使用Group

In [12]:
#從查回結果分析同一天有幾篇

from pymongo import command_cursor
from pymongo import MongoClient  #logical query selector

client = MongoClient('10.120.28.12', 27017)
database = client["test"]
collection = database["test2"]


tag1 = "王建民"
tag2 = "滾地球"
tag3 = "皇家"

resp_list = collection.find(     # 這是對照組
    {"$and":[                   
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
            {"content":{"$regex":tag3}},
            ]},{"_id":0})
client.close()

print "總共傳回",resp_list.count(),"筆資料"
#print [date["date"] for date in resp_list]  # 此行印出每筆資料的date


#sql的group by在mongoDB用aggregate()來替代, 用法可以很複雜
#aggregate([{字典1},{字典2},{字典3},...])
#由於find被aggregate取代了, 所以查詢條件必須寫在$match:這本字典內
#{字典1} = {"$match:{querry的條件,這裡面跟find()裡面的用法一樣}"} 
#{字典2} = {"$group":{"_id":要做group by的欄位, "新增的欄位名":{"$sum":1}}}
#{字典3} = {其他函數} 這本用來寫如$sort, $limit等函數

date_list = collection.aggregate([
        {"$match":                                        #$match:設定查詢條件的字典
          {"$and":[                   
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
            {"content":{"$regex":tag3}},
            ]}},
        {"$group":{"_id":"$date","count":{"$sum":1},"title":{"$first":"$title"}}},    #$group:設定group by的欄位, 計算筆數
        {"$sort":{"_id":1}}                               #查詢結果排序
        ])

day_count = 0
print "日期, 該日有幾筆資料"
#aggregate傳回的物件型態是command_cursor, 可以強制轉型list(command_cursor)

# 註解的方式是以疊代器(Iterator)的方式處理,
# 但python並沒有hasNext()函數而是pymongo.command_cursor自帶的alive, 記得要import才能用
# 使用此方法傳回最後一個值的時候會丟出StopIteration,得try-except處理掉
#while date_list.alive: 
#    try:
#        print date_list.next()["_id"],date_list.next()["count"]
#        print date_list.next()
#    except StopIteration as stop:
#        date_list.close()
#        break

for ele in date_list:   # 用普通的for處理
    day_count += ele["count"]
    print ele["_id"],ele["count"],ele["title"]
    
print "總共傳回", day_count,"筆資料" #驗算, 與對照組結果比對, 但有2014/05/這類不明意義的東西

總共傳回 11 筆資料
日期, 該日有幾筆資料
20160304 1 飆近150公里,1局1安1K無失分,建仔：很輕鬆
20160308 1 2局無失分,首奪中繼點,建仔滾地球王再現
20160313 1 飆151公里,伸卡王回來了,速球連發台灣囝仔,周末發光
20160316 1 建仔伸卡Ｋ老友,飆153公里,變活廣告
20160321 3 4絕技,建仔大躍進,「先發有望」伸卡犀利,球速153,K功強,夠穩定
20160325 1 建仔6球收工,完美中繼1局,回大聯盟利多
20160327 1 杜恩辛丟1分,建仔上大聯盟利多
20160328 1 建仔3局無失分首勝報到,飆速153公里
20160331 1 建仔連3戰無失分,36歲生日快樂,後天再戰響尾蛇
總共傳回 11 筆資料


## pymongo sort()
- .sort("date",1):

In [2]:
from pymongo import MongoClient  #logical query selector

client = MongoClient('10.120.28.20', 27017)
database = client["test"]
collection = database["news"]

count = 0
tag1 = "王建民"
tag2 = "滾地球"
tag3 = "皇家"

#查詢滿足多重條件的語法是{"$and":[{條件1},{條件2},{條件3},...]}
#另有$or, $not, $nor
#$all在pymongo已廢止, 因為與$and功能重複

for post in collection.find(
    {"$and":[                   
            {"content":{"$regex":tag1}},
            {"content":{"$regex":tag2}},
            {"content":{"$regex":tag3}},
            {"date":{"$regex":"2016"}}
            ]},{"_id":0}).sort("date",1):
    print post['date']

2016/02/
2016/03/
2016/03/
2016/03/
2016/03/
2016/03/
2016/03/
2016/03/
20160126
20160228
20160303
20160303
20160303
20160304
20160304
20160307
20160307
20160307
20160307
20160307
20160308
20160308
20160308
20160310
20160312
20160312
20160313
20160313
20160315
20160315
20160315
20160316
20160316
20160317
20160318
20160320
20160320
20160320
20160320
20160320
20160320
20160320
20160321
20160321
20160321
20160321
20160324
20160324
20160324
20160324
20160325
20160325
20160325
20160325
20160327
20160327
20160327
20160327
20160327
20160327
20160327
20160327
20160328
20160328
20160329
20160330
20160330
20160330
20160330
20160330
20160330
20160330
20160330
20160330
20160331
20160331
20160331


In [4]:
from pymongo import MongoClient  #logical query selector

client = MongoClient('10.120.28.12', 27017)
database = client["test"]
collection = database["news2"]
collection2 = database['ptt']

In [8]:
count = 0
tag1 = "王建民"
tag2 = "滾地球"
tag3 = "皇家"

#查詢滿足多重條件的語法是{"$and":[{條件1},{條件2},{條件3},...]}
#另有$or, $not, $nor
#$all在pymongo已廢止, 因為與$and功能重複

for post in collection.find(
    {"$and":[                   
            {"tfidf":{"$regex":tag1}},
            {"tfidf":{"$regex":tag2}},
            {"tfidf":{"$regex":tag3}},
            {"date":{"$regex":"2016"}}
            ]},{"_id":0}).sort("date",1):
    print post['date']
    
print "------------------------------------------"
for post in collection2.find(
    {"$and":[                   
            {"tfidf":{"$regex":tag1}},
            {"tfidf":{"$regex":tag2}},
            {"tfidf":{"$regex":tag3}},
            {"date":{"$regex":"2016"}}
            ]},{"_id":0}).sort("date",1):
    print post['pos'][0]

20160304
20160307
20160307
20160307
20160307
20160308
20160308
20160310
20160315
20160316
20160320
20160320
20160320
20160321
20160324
20160324
20160325
20160327
20160327
20160327
20160327
20160328
20160330
20160330
20160330
20160330
20160330
20160331
20160331
20160427
------------------------------------------
加油
讚
加油
猛將
不錯


IndexError: list index out of range