Permalink
Browse files

Switch to native protocol driver, cql3

  • Loading branch information...
thobbs committed Feb 27, 2014
1 parent 50c5b7c commit 678abbef627ce52b025b4ef88779765005d19439
View
250 cass.py
@@ -1,23 +1,16 @@
import time
from datetime import datetime
from uuid import uuid1, UUID
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException
from cassandra.cluster import Cluster
__all__ = ['get_user_by_username', 'get_friend_usernames',
'get_follower_usernames', 'get_users_for_usernames', 'get_friends',
'get_followers', 'get_timeline', 'get_userline', 'get_tweet', 'save_user',
'save_tweet', 'add_friends', 'remove_friends', 'DatabaseError',
'NotFound', 'InvalidDictionary', 'PUBLIC_USERLINE_KEY']
'get_follower_usernames', 'get_users_for_usernames', 'get_friends',
'get_followers', 'get_timeline', 'get_userline', 'get_tweet', 'save_user',
'save_tweet', 'add_friends', 'remove_friends', 'DatabaseError',
'NotFound', 'InvalidDictionary', 'PUBLIC_USERLINE_KEY']
POOL = ConnectionPool('Twissandra')
USER = ColumnFamily(POOL, 'User')
FRIENDS = ColumnFamily(POOL, 'Friends')
FOLLOWERS = ColumnFamily(POOL, 'Followers')
TWEET = ColumnFamily(POOL, 'Tweet')
TIMELINE = ColumnFamily(POOL, 'Timeline')
USERLINE = ColumnFamily(POOL, 'Userline')
CLUSTER = Cluster(['127.0.0.1'])
SESSION = CLUSTER.connect('twissandra')
# NOTE: Having a single userline key to store all of the public tweets is not
# scalable. Currently, Cassandra requires that an entire row (meaning
@@ -48,64 +41,45 @@ class NotFound(DatabaseError):
class InvalidDictionary(DatabaseError):
pass
def _get_friend_or_follower_usernames(cf, username, count):
"""
Gets the social graph (friends or followers) for a username.
"""
try:
friends = cf.get(str(username), column_count=count)
except NotFoundException:
return []
return friends.keys()
def _get_line(cf, username, start, limit):
def _get_line(table, username, start, limit):
"""
Gets a timeline or a userline given a username, a start, and a limit.
"""
# First we need to get the raw timeline (in the form of tweet ids)
query = "SELECT time, tweet_id FROM {table} WHERE username=%s {time_clause} LIMIT %s"
# We get one more tweet than asked for, and if we exceed the limit by doing
# so, that tweet's key (timestamp) is returned as the 'next' key for
# pagination.
start = long(start) if start else ''
next = None
try:
timeline = cf.get(str(username), column_start=start,
column_count=limit + 1, column_reversed=True)
except NotFoundException:
return [], next
if len(timeline) > limit:
# Find the minimum timestamp from our get (the oldest one), and convert
# it to a non-floating value.
oldest_timestamp = min(timeline.keys())
if not start:
time_clause = ''
params = (username, limit)
else:
time_clause = 'AND time < %s'
params = (username, UUID(start), limit)
# Present the string version of the oldest_timestamp for the UI...
next = str(oldest_timestamp)
query = query.format(table=table, time_clause=time_clause)
# And then convert the pylong back to a bitpacked key so we can delete
# if from timeline.
del timeline[oldest_timestamp]
results = SESSION.execute(query, params)
if not results:
return [], None
# Now we do a multiget to get the tweets themselves
tweet_ids = timeline.values()
tweets = TWEET.multiget(tweet_ids)
if len(results) == limit:
# Find the minimum timestamp from our get (the oldest one), and convert
# it to a non-floating value.
oldest_timeuuid = min(row.time for row in results)
# We want to get the information about the user who made the tweet
# First, pull out the list of unique users for our tweets
usernames = list(set([tweet['username'] for tweet in tweets.values()]))
users = USER.multiget(usernames)
# Present the string version of the oldest_timeuuid for the UI...
next_timeuuid = oldest_timeuuid.urn[len('urn:uuid:'):]
else:
next_timeuuid = None
# Then, create a list of tweets with the user record and id
# attached, and the body decoded properly.
result_tweets = list()
for tweet_id, tweet in tweets.iteritems():
tweet['user'] = users.get(tweet['username'])
tweet['body'] = tweet['body'].decode('utf-8')
tweet['id'] = tweet_id
result_tweets.append(tweet)
# Now we fetch the tweets themselves
futures = []
for row in results:
futures.append(SESSION.execute_async(
"SELECT * FROM tweets WHERE tweet_id=%s", (row.tweet_id, )))
return (result_tweets, next)
tweets = [f.result()[0] for f in futures]
return (tweets, next_timeuuid)
# QUERYING APIs
@@ -114,35 +88,53 @@ def get_user_by_username(username):
"""
Given a username, this gets the user record.
"""
try:
user = USER.get(str(username))
except NotFoundException:
rows = SESSION.execute("SELECT * FROM users WHERE username=%s", (username, ))
if not rows:
raise NotFound('User %s not found' % (username,))
return user
else:
return rows[0]
def get_friend_usernames(username, count=5000):
"""
Given a username, gets the usernames of the people that the user is
following.
"""
return _get_friend_or_follower_usernames(FRIENDS, username, count)
rows = SESSION.execute(
"SELECT friend FROM friends WHERE username=%s LIMIT %s",
(username, count))
return [row.friend for row in rows]
def get_follower_usernames(username, count=5000):
"""
Given a username, gets the usernames of the people following that user.
"""
return _get_friend_or_follower_usernames(FOLLOWERS, username, count)
rows = SESSION.execute(
"SELECT follower FROM followers WHERE username=%s LIMIT %s",
(username, count))
return [row['follower'] for row in rows]
def get_users_for_usernames(usernames):
"""
Given a list of usernames, this gets the associated user object for each
one.
"""
try:
users = USER.multiget(map(str, usernames))
except NotFoundException:
raise NotFound('Users %s not found' % (usernames,))
return users.values()
futures = []
for user in usernames:
future = SESSION.execute_async("SELECT * FROM users WHERE username=%s", (user, ))
futures.append(future)
users = []
for user, future in zip(usernames, futures):
results = future.result()
if not results:
raise NotFound('User %s not found' % (user,))
users.append(results[0])
return users
def get_friends(username, count=5000):
"""
@@ -151,92 +143,136 @@ def get_friends(username, count=5000):
friend_usernames = get_friend_usernames(username, count=count)
return get_users_for_usernames(friend_usernames)
def get_followers(username, count=5000):
"""
Given a username, gets the people following that user.
"""
follower_usernames = get_follower_usernames(username, count=count)
return get_users_for_usernames(follower_usernames)
def get_timeline(username, start=None, limit=40):
"""
Given a username, get their tweet timeline (tweets from people they follow).
"""
return _get_line(TIMELINE, username, start, limit)
return _get_line("timeline", username, start, limit)
def get_userline(username, start=None, limit=40):
"""
Given a username, get their userline (their tweets).
"""
return _get_line(USERLINE, username, start, limit)
return _get_line("userline", username, start, limit)
def get_tweet(tweet_id):
"""
Given a tweet id, this gets the entire tweet record.
"""
try:
tweet = TWEET.get(str(tweet_id))
except NotFoundException:
results = SESSION.execute("SELECT * FROM tweets WHERE tweet_id=%s", (tweet_id, ))
if not results:
raise NotFound('Tweet %s not found' % (tweet_id,))
tweet['body'] = tweet['body'].decode('utf-8')
return tweet
else:
return results[0]
def get_tweets_for_tweet_ids(tweet_ids):
"""
Given a list of tweet ids, this gets the associated tweet object for each
one.
"""
try:
tweets = TWEET.multiget(map(str, tweet_ids))
except NotFoundException:
raise NotFound('Tweets %s not found' % (tweet_ids,))
return tweets.values()
futures = []
for tweet_id in tweet_ids:
futures.append(SESSION.execute_async(
"SELECT * FROM tweets WHERE tweet_id=%s", (tweet_id, )))
tweets = []
for tweet_id, future in zip(tweet_id, futures):
result = future.result()
if not result:
raise NotFound('Tweet %s not found' % (tweet_id,))
else:
tweets.append(result[0])
return tweets
# INSERTING APIs
def save_user(username, user):
def save_user(username, password):
"""
Saves the user record.
"""
USER.insert(str(username), user)
SESSION.execute(
"INSERT INTO users (username, password) VALUES (%s, %s)",
(username, password))
def save_tweet(tweet_id, username, tweet, timestamp=None):
"""
Saves the tweet record.
"""
# Generate a timestamp for the USER/TIMELINE
if not timestamp:
ts = long(time.time() * 1e6)
else:
ts = timestamp
# Make sure the tweet body is utf-8 encoded
tweet['body'] = tweet['body'].encode('utf-8')
# TODO don't ignore timestamp
now = uuid1()
# Insert the tweet, then into the user's timeline, then into the public one
TWEET.insert(str(tweet_id), tweet)
USERLINE.insert(str(username), {ts: str(tweet_id)})
USERLINE.insert(PUBLIC_USERLINE_KEY, {ts: str(tweet_id)})
SESSION.execute(
"INSERT INTO tweets (tweet_id, username, body) VALUES (%s, %s, %s)",
(tweet_id, username, tweet))
SESSION.execute(
"INSERT INTO userline (username, time, tweet_id) VALUES (%s, %s, %s)",
(username, now, tweet_id))
SESSION.execute(
"INSERT INTO userline (username, time, tweet_id) VALUES (%s, %s, %s)",
(PUBLIC_USERLINE_KEY, now, tweet_id))
# Get the user's followers, and insert the tweet into all of their streams
futures = []
follower_usernames = [username] + get_follower_usernames(username)
for follower_username in follower_usernames:
TIMELINE.insert(str(follower_username), {ts: str(tweet_id)})
futures.append(SESSION.execute_async(
"INSERT INTO timeline (username, time, tweet_id) VALUES (%s, %s, %s)",
(follower_username, now, tweet_id)))
for future in futures:
future.result()
def add_friends(from_username, to_usernames):
"""
Adds a friendship relationship from one user to some others.
"""
ts = str(int(time.time() * 1e6))
dct = dict((str(username), ts) for username in to_usernames)
FRIENDS.insert(str(from_username), dct)
for to_username in to_usernames:
FOLLOWERS.insert(str(to_username), {str(from_username): ts})
now = datetime.utcnow()
futures = []
for to_user in to_usernames:
futures.append(SESSION.execute_async(
"INSERT INTO friends (username, friend, since) VALUES (%s, %s, %s)",
(from_username, to_user, now)))
futures.append(SESSION.execute_async(
"INSERT INTO followers (username, follower, since) VALUES (%s, %s, %s)",
(to_user, from_username, now)))
for future in futures:
future.result()
def remove_friends(from_username, to_usernames):
"""
Removes a friendship relationship from one user to some others.
"""
FRIENDS.remove(str(from_username), columns=map(str, to_usernames))
for to_username in to_usernames:
FOLLOWERS.remove(str(to_username), columns=[str(from_username)])
futures = []
for to_user in to_usernames:
futures.append(SESSION.execute_async(
"DELETE FROM friends WHERE username=%s AND friend=%s",
(from_username, to_user)))
futures.append(SESSION.execute_async(
"DELETE FROM followers WHERE username=%s AND follower=%s",
(to_user, from_username)))
for future in futures:
future.result()
View
@@ -1,5 +1,4 @@
Django
Django==1.5.5
simplejson
thrift
pycassa
cassandra-driver
loremipsum
@@ -23,8 +23,8 @@ <h2 class="grid_4 suffix_5">{{ username }}&rsquo;s Timeline</h2>
{% if request.user.is_authenticated %}
{% ifnotequal request.user.id user.id %}
<form method="POST" action="{% url 'modify_friend' %}?next={{ request.path }}">
<input type="hidden" name="{% if 'user.friend' %}remove{% else %}add{% endif %}-friend" value="{{ user.id }}" />
<input type="submit" value="{% if 'user.friend' %}Remove{% else %}Add{% endif %} Friend" />
<input type="hidden" name="{% if is_friend %}remove{% else %}add{% endif %}-friend" value="{{ user.id }}" />
<input type="submit" value="{% if is_friend %}Remove{% else %}Add{% endif %} Friend" />
</form>
{% endifnotequal %}
{% else %}
Oops, something went wrong.

1 comment on commit 678abbe

@matthewrudy

This comment has been minimized.

Show comment
Hide comment
@matthewrudy

matthewrudy Mar 1, 2014

wow, this is much clearer,
thanks.

matthewrudy commented on 678abbe Mar 1, 2014

wow, this is much clearer,
thanks.

Please sign in to comment.