
Merge master to devextra (#230)

* Added option for custom type

* Create automate.py

* Finished dataframes storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the dataframe for users that one is following/followed by; in `config.py`, added autoupdate so that one does not have to call `storage.panda.get()` at every run; in `output.py`, edited follow_object; in `run.py`, added an autoupdate function for panda; in `tweet.py`, just some docstrings.

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes
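
A minimal sketch of how the dataframe storage might be consumed from Python (hedged: `c.Pandas` and `twint.storage.panda.Follow_df` are assumed names based on this PR's description, not a confirmed API):

```python
import twint

c = twint.Config()
c.Username = "user"
c.Pandas = True  # assumed flag that enables in-memory dataframe storage

# scrape followers of "user"; the results land in a Pandas dataframe
twint.run.Followers(c)

# assumed accessor for the followers/following dataframe
df = twint.storage.panda.Follow_df
print(df.head())
```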

* Added proxy support (#225)

* Added proxy #139

* Added new requirement, fixed proxy, added proxy config
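
For example, the new proxy flags might be used like this (hedged: invocation inferred from the CLI args added in this PR):

```sh
# route the scrape through Tor's default SOCKS5 proxy (localhost:9050)
python3 Twint.py -u user --proxy-host tor

# or through an explicit proxy
python3 Twint.py -u user --proxy-host 127.0.0.1 --proxy-port 9050 --proxy-type socks5
```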

* Changed index names, removed duplicate arg

* Updated default CLI args

* Added visualizations and dashboard

* Typo fix

* Added logging options, fixed retweets

* Update README.md

Added examples and how-to

* Updated index and fixes

* Update

* Update dashboards

* Update

* Update index-tweets, fixed visualizations and new dashboard

* Update doc

* Fixed errors with user_full

* Fixed a quite hidden issue

* Added print error

* Added another print error

* Update

* #173

* Fix non-latin chars #229
pielco11 committed Oct 1, 2018
1 parent 107f76d commit 22f343559a7b0ca278daaed7d2c209b4220a8641
@@ -1,4 +1,7 @@
# TWINT - Twitter Intelligence Tool
![2](https://i.imgur.com/iaH3s7z.png)
![3](https://i.imgur.com/hVeCrqL.png)

[![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/haccer/twint.svg?branch=master)](https://travis-ci.org/haccer/twint/) [![Python 3.5|3.6](https://img.shields.io/badge/Python-3.5%2F3.6-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE)

>No authentication. No API. No limits.
Twint.py
@@ -8,24 +8,27 @@
Licensed under MIT License
Copyright (c) 2018 Cody Zacharias
'''
import sys
import os
import argparse
import twint

def error(_error, message):
    """ Print errors to stdout
    """
    print("[-] {}: {}".format(_error, message))
    sys.exit(0)

def check(args):
    """ Error checking
    """
    if args.username is not None:
        if args.verified:
            error("Contradicting Args",
                  "Please use --verified in combination with -s.")
        if args.userid:
            error("Contradicting Args",
                  "--userid and -u cannot be used together.")
    if args.output is None:
        if args.csv:
            error("Error", "Please specify an output file (Example: -o file.csv).")
@@ -40,44 +43,23 @@ def check(args):
    if args.user_full:
        error("Error", "Please use --user-full with --followers or --following.")

    # Proxy stuff
    if args.proxy_host is not None:
        import socks, socket  # moved up: socks is needed in every branch below
        if args.proxy_host.lower() == "tor":
            socks.set_default_proxy(socks.SOCKS5, "localhost", 9050)
            socket.socket = socks.socksocket
        elif args.proxy_port and args.proxy_type:
            if args.proxy_type.lower() == "socks5":
                _type = socks.SOCKS5
            elif args.proxy_type.lower() == "socks4":
                _type = socks.SOCKS4
            elif args.proxy_type.lower() == "http":
                _type = socks.HTTP
            else:
                error("Error", "Proxy types allowed are: socks5, socks4, and http.")
            socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
            socket.socket = socks.socksocket
        else:
            error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type.")
    else:
        if args.proxy_port or args.proxy_type:
            error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type.")

def loadUserList(ul, _type):
    """ Concatenate users
    """
    if os.path.exists(os.path.abspath(ul)):
        userlist = open(os.path.abspath(ul), "r").read().splitlines()
    else:
        userlist = ul.split(",")
    if _type == "search":
        un = ""
        for user in userlist:
            un += "%20OR%20from%3A" + user
        return un[15:]
    return userlist
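# For illustration: loadUserList("john,janet", "search") returns
# "john%20OR%20from%3Ajanet" (un[15:] slices off the leading "%20OR%20from%3A"),
# i.e. the URL-encoded search fragment 'john OR from:janet'.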

def initialize(args):
    """ Set default values for config from args
    """
    c = twint.Config()
    c.Username = args.username
    c.User_id = args.userid
@@ -123,12 +105,15 @@ def initialize(args):
    c.Media = args.media
    c.Replies = args.replies
    c.Pandas_clean = args.pandas_clean
    c.ES_count = {"likes": True, "replies": True, "retweets": True}
    return c

def options():
    """ Parse arguments
    """
    ap = argparse.ArgumentParser(prog="Twint.py",
                                 usage="python3 %(prog)s [options]",
                                 description="TWINT - An Advanced Twitter Scraping Tool.")
    ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
    ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
@@ -143,15 +128,16 @@ def options():
ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
action="store_true")
action="store_true")
ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
ap.add_argument("--json", help="Write as .json file", action="store_true")
ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
ap.add_argument("--userid", help="Twitter user id.")
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
action="store_true")
ap.add_argument("--hostname", help="Store the mysql database host")
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
ap.add_argument("--DB_user", help="Store the mysql database user")
@@ -164,59 +150,74 @@ def options():
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
ap.add_argument("--proxy-port", help="The port of the proxy server.")
ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
ap.add_argument("--essid",
help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
nargs="?", default="")
ap.add_argument("--userlist", help="Userlist from list or file.")
ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true")
ap.add_argument("--retweets",
help="Include user's Retweets (Warning: limited).",
action="store_true")
ap.add_argument("--format", help="Custom output format (See wiki for details).")
ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
action="store_true")
ap.add_argument("--user-full",
help="Collect all user information (Use with followers or following only).",
action="store_true")
ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
action="store_true")
help="Slow, but effective method of collecting a user's Tweets and RT.",
action="store_true")
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
ap.add_argument("--search_name", help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.")
ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.")
ap.add_argument("--debug", help="Store information in debug logs", action="store_true")
ap.add_argument("--pandas-type",
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
ap.add_argument("--search_name",
help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets",
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
ap.add_argument("-if", "--index-follow",
help="Custom Elasticsearch Index name for Follows.",
nargs="?", default="twintgraph")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
nargs="?", default="twintuser")
ap.add_argument("--debug",
help="Store information in debug logs", action="store_true")
ap.add_argument("--resume", help="Resume from Tweet ID.")
ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--media",
help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("-pc", "--pandas-clean",
help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("-ec", "--es-count", nargs="?", default="",
help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
args = ap.parse_args()

return args

def main():
    """ Main
    """
    args = options()
    check(args)

    if args.userlist:
        args.username = loadUserList(args.userlist, "search")

    if args.pandas_clean:
        twint.storage.panda.clean()

    c = initialize(args)

    # --es-count lists what NOT to count, so disable those counters
    # (the defaults set in initialize() are all True)
    if "likes" in str(args.es_count):
        c.ES_count["likes"] = False

    if "replies" in str(args.es_count):
        c.ES_count["replies"] = False

    if "retweets" in str(args.es_count):
        c.ES_count["retweets"] = False

    if args.favorites:
        if args.userlist:
            _userlist = loadUserList(args.userlist, "favorites")
automate.py
@@ -0,0 +1,65 @@
import twint
import schedule
import time

# you can change the name of each "job" after "def" if you'd like.
def jobone():
    print("Fetching Tweets")
    c = twint.Config()
    # choose username (optional)
    c.Username = "insert username here"
    # choose search term (optional)
    c.Search = "insert search term here"
    # choose beginning time (narrow results)
    c.Since = "2018-01-01"
    # set limit on total tweets
    c.Limit = 1000
    # store results as a properly formatted csv
    c.Store_csv = True
    # columns of the csv
    c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
    # name of the csv file
    c.Output = "filename.csv"
    twint.run.Search(c)

def jobtwo():
    print("Fetching Tweets")
    c = twint.Config()
    # choose username (optional)
    c.Username = "insert username here"
    # choose search term (optional)
    c.Search = "insert search term here"
    # choose beginning time (narrow results)
    c.Since = "2018-01-01"
    # set limit on total tweets
    c.Limit = 1000
    # store results as a properly formatted csv
    c.Store_csv = True
    # columns of the csv
    c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
    # name of the csv file
    c.Output = "filename2.csv"
    twint.run.Search(c)

# run once when you start the program

jobone()
jobtwo()

# run every N minutes, every hour, every day at a given time, or on a given day of
# the week (optionally at a time). Keep the schedules you don't want commented out
# with "#"; remove the "#" to activate one. Also, replace "jobone" and "jobtwo"
# with your new function names (if applicable).

# schedule.every(1).minutes.do(jobone)
schedule.every().hour.do(jobone)
# schedule.every().day.at("10:30").do(jobone)
# schedule.every().monday.do(jobone)
# schedule.every().wednesday.at("13:15").do(jobone)

# schedule.every(1).minutes.do(jobtwo)
schedule.every().hour.do(jobtwo)
# schedule.every().day.at("10:30").do(jobtwo)
# schedule.every().monday.do(jobtwo)
# schedule.every().wednesday.at("13:15").do(jobtwo)

while True:
    schedule.run_pending()
    time.sleep(1)
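
To run the scheduler (file name `automate.py` per the commit message; `schedule` is a third-party PyPI package):

```sh
pip3 install schedule
python3 automate.py
```
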
@@ -30,24 +30,55 @@ If you are not getting these outputs I suggest you to dig in the corresponding d

Now that everything is up and running:

1. Index some data: `python3.6 Twint.py --elasticsearch localhost:9200 -u user` (in this case `--elasticsearch` is a mandatory argument and its value is the host:port combination that the Elasticsearch instance is bound to);

2. Now we can create the index (which I already built): open your browser and go to `http://localhost:5601` (again, this is a default value), open the `Dev Tools` tab, copy&paste `index-tweets.json` and then click the green arrow. Expected output is

```json
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "twinttweets"
}
```

3. Go to the `Management` tab, `Index Patterns`, `Create Index Pattern`, `Index Pattern: twinttweets`, and choose `datestamp` as the time field;

4. Go to the `Discover` tab, choose `twinttweets`, and you should see something like this:

![1](https://i.imgur.com/Ut9173J.png)

### Notes
PS: this screenshot shows an index named `tweep`; you will see `twinttweets`.

Different indexes can have different visualizations, so there is no general rule; with the basics provided in the Wiki you should be able to create your own. In any case, don't hesitate to ask questions.
### Query How-to
1. Filter out "multiplied" data and analyze only the tweets themselves.
If the counting of likes/retweets/replies was enabled during the indexing phase (see the `--es-count` param), you may need to filter those counting documents out; to do so, type `NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies` in the `Search` bar;

2. Filter tweets from a specific username: `username: handle`, where `handle` is `@handle`;
3. Filter tweets from a specific user_id: `user_id: 0123456`;
4. Filter tweets containing a specific word: `tweet: osint`;
5. Define specific timestamp intervals: click on the clock in the top right corner;
6. Concatenate conditions: Lucene syntax has some logic built in; operators like `AND` and `OR` are useful to restrict the data that you want to study;

[Here](https://www.elastic.co/guide/en/kibana/current/lucene-query.html) is a short article about Lucene Query Syntax.

### Examples
Search for every tweet from "@John" and "@Janet":
`username: John AND username: Janet`

Search for tweets from "myearthquakeapp" and restrict the result for earthquakes with magnitude between 5.0 and 5.9:
`username: myearthquakeapp AND tweet: 5.?`

Search for tweets with at least 5 likes:
`nlikes: [5 TO *]`, and similarly tweets with at least 1 like and at most 10: `nlikes: [1 TO 10]` (`[]` extremes included, `{}` extremes excluded)
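
These conditions compose with `AND`; for instance, tweets from "myearthquakeapp" mentioning a magnitude in the 5.x range with at least 5 likes:
`username: myearthquakeapp AND tweet: 5.? AND nlikes: [5 TO *]`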

### Ready-to-Use Visualizations
With the newest versions of Kibana users can export objects, for example, but not limited to, visualizations and dashboards.

Making visualizations is a simple but not easy process: you have to combine how you want to index your data with how you want to visualize it.

To help you get started with Twint and Elasticsearch, I made some basic visualizations and a dashboard. To use them, you just have to import them: go to the `Management` tab (the gear), `Saved Objects`, `Import`, and then select `visualizations.json`; repeat the process for `dashboard.json`.
After this, just go to the `Dashboard` tab and click on `Twint Dashboard`.

![2](https://i.imgur.com/iaH3s7z.png)
![3](https://i.imgur.com/hVeCrqL.png)
dashboard.json
@@ -0,0 +1,18 @@
[
  {
    "_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d",
    "_type": "dashboard",
    "_source": {
      "title": "Twint Dashboard",
      "hits": 0,
      "description": "",
      "panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]",
      "optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}",
      "version": 1,
      "timeRestore": false,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}"
      }
    }
  }
]
index-follow.json
@@ -1,4 +1,4 @@
PUT twintgraph
{
  "mappings": {
    "items": {
