# Split Twitter text into individual tweets

Based on https://github.com/paulfurley/python-tweet-splitter, https://github.com/glyph/twitter-text-py & https://github.com/twitter/twitter-text/

First run all the cells below to load classes.

In [85]:
# Paste the text to split between the triple quotes below.
# As shipped, the value is a single newline, i.e. no text has been entered yet.
text = """
"""

In [86]:
# Split `text` into tweets and print them one by one, each followed by an
# empty line so the individual tweets are easy to tell apart.
ts = TweetSplitter(text)
for tweet in ts.split():
    print(tweet)
    print("")






In [79]:
"""
Enhancements include:

* tweet-text validation (based on https://github.com/twitter/twitter-text/) 
* natural sentence breaks, plus maintains paragraph structure
"""

from nltk import tokenize

# Tweet-length settings.  In the code visible here only
# "maxWeightedTweetLength" (per-tweet limit) and "transformedURLLength"
# (length every URL is counted as) are read; the other keys are carried along
# unchanged.
config = {
    "defaults": {
        "version": 3,
        "maxWeightedTweetLength": 280,
        "scale": 100,
        "defaultWeight": 200,
        "emojiParsingEnabled": True,
        "transformedURLLength": 23,
        # Every listed span carries the same weight of 100.
        "ranges": [
            {"start": first, "end": last, "weight": 100}
            for first, last in (
                (0, 4351),
                (8192, 8205),
                (8208, 8223),
                (8242, 8247),
            )
        ],
    }
}

class TweetSplitter:
    """
    Split a long text into a list of tweets that each fit the length limit.

    Non-blank lines of the input are treated as paragraphs and are never merged
    with each other. A paragraph that is too long is broken at sentence
    boundaries; a sentence that is too long is broken between words.

    Keyword arguments:
        dangle -- minimum number of leading words of the next sentence that
                  must fit onto the current tweet before any of them are moved
                  there (default 3).
    """

    def __init__(self, text, **kwargs):
        # Per-tweet limit, read from the module-level ``config``.
        self.max_length = config.get("defaults", {}).get("maxWeightedTweetLength")
        # Respect existing structure
        self.text = text
        # Result of the most recent split() call.
        self.split_text = []
        self.dangle = kwargs.get("dangle", 3) # minimum hanging start of sentence

    def split(self):
        """
        Return the text as a list of tweets.

        If the whole text already fits, it is returned unchanged as the only
        element. Otherwise every non-blank line is handled on its own: kept
        as is when it fits, else split into sentences and re-packed.
        """
        # Start from a clean slate so that calling split() a second time does
        # not append another copy of every tweet.
        self.split_text = []
        # Use the same URL-aware measure as the per-paragraph checks below;
        # a plain len() would disagree with them whenever the text has links.
        if self._tweet_length(self.text) <= self.max_length:
            self.split_text.append(self.text)
            return self.split_text
        for paragraph in self.text.splitlines():
            if not paragraph.strip():
                continue
            if self._tweet_length(paragraph) <= self.max_length:
                self.split_text.append(paragraph)
            else:
                sentences = tokenize.sent_tokenize(paragraph)
                self.split_text.extend(self._split_sentences(sentences))
        return self.split_text

    def _split_sentences(self, sentences):
        """
        Pack the sentences of one paragraph into as few tweets as possible.

        * A sentence longer than max_length is split between words; its last
          fragment becomes the "hanging" tweet that later text may extend.
        * Every following sentence is combined with the hanging tweet through
          _create_tweets(); whatever comes back last is the new hanging tweet.
        * A single leftover sentence simply becomes the final tweet.

        The list passed in is not modified. Returns a list of strings.
        """
        pending = list(sentences)
        new_tweets = []
        hanging_twit = None
        while pending:
            if not hanging_twit:
                # Nothing to extend yet: the next sentence opens a new tweet.
                sentence = pending.pop(0)
                if self._tweet_length(sentence) > self.max_length:
                    pieces = list(self._generate_split_tweets(sentence.split(" ")))
                    new_tweets.extend(pieces[:-1])
                    hanging_twit = pieces[-1].strip()
                else:
                    hanging_twit = sentence.strip()
                continue
            sub_twits = self._create_tweets(hanging_twit, pending.pop(0))
            new_tweets.extend(sub_twits[:-1])
            hanging_twit = sub_twits[-1].strip()
        if hanging_twit:
            new_tweets.append(hanging_twit)
        return new_tweets

    def _create_tweets(self, text1, text2, ignore_dangle = False):
        """
        Receive two strings. Optimise their correspondence to fit max_length.

        * Join them with a space if the joined text fits into one tweet.
        * Otherwise move leading words of text2 onto text1 while they fit.
          Unless ignore_dangle is True this only happens when text2 has more
          than ``dangle`` words and the first ``dangle`` of them fit together.
        * Return [text1, rest of text2]; if the rest is still longer than
          max_length it is split between words into several tweets.

        Returns a list of one or more strings.
        """
        joined = " ".join([text1, text2])
        if self._tweet_length(joined) <= self.max_length:
            return [joined]
        text2_words = text2.split(" ")
        if not ignore_dangle and len(text2_words) > self.dangle:
            # Measure the joined candidate so the separating spaces count too.
            probe = " ".join([text1] + text2_words[:self.dangle])
            if self._tweet_length(probe) <= self.max_length:
                ignore_dangle = True
        if ignore_dangle:
            while text2_words:
                candidate = text1 + " " + text2_words[0]
                if self._tweet_length(candidate) > self.max_length:
                    break
                text1 = candidate
                text2_words.pop(0)
        if not text2_words:
            # Every word found room on text1; nothing is left for a second tweet.
            return [text1]
        remainder = " ".join(text2_words)
        if self._tweet_length(remainder) <= self.max_length:
            return [text1, remainder]
        # Edge case of text2 still being > max_length ...
        return [text1] + list(self._generate_split_tweets(text2_words))

    def _generate_split_tweets(self, words):
        """
        Yield tweets built by greedily joining ``words`` with single spaces.

        A word is appended to the current tweet only if the joined result
        (including the space) stays within max_length. A single word that is
        longer than max_length is yielded as is. ``words`` is not modified.
        """
        this_tweet = None
        for word in words:
            if not this_tweet:
                this_tweet = word
                continue
            candidate = " ".join([this_tweet, word])
            if self._tweet_length(candidate) > self.max_length:
                yield this_tweet
                this_tweet = word
            else:
                this_tweet = candidate
        if this_tweet is not None:
            yield this_tweet

    def _tweet_length(self, text):
        """
        Return the length of ``text`` as counted for a tweet.

        The text is first brought into Unicode NFC
        (see http://www.unicode.org/reports/tr15) so that the result does not
        depend on which equivalent form was transmitted. For example:
             U+0065  Latin Small Letter E
         +   U+0301  Combining Acute Accent
         ----------
         =   2 characters, displayed as é (1 visual glyph)
             The NFC of {U+0065, U+0301} is {U+00E9}, a single character.
        A string that already contains U+00E9 is left unchanged.

        Every URL found by Extractor is then counted as
        config "transformedURLLength" characters instead of its real length.

        NOTE: the weights in config ("ranges", "defaultWeight", "scale") are
        not applied here; every character counts as 1.

        Source: https://github.com/glyph/twitter-text-py/blob/master/twitter_text/validation.py
        """
        import unicodedata

        # Byte strings (Python 2 ``str``) cannot be normalised; count them as is.
        if not isinstance(text, bytes):
            text = unicodedata.normalize("NFC", text)
        url_length = config.get("defaults", {}).get("transformedURLLength")
        length = len(text)
        for url in Extractor(text).extract_urls_with_indices():
            start, end = url['indices']
            # remove the length of the original URL
            length += start - end
            # add the length of the t.co URL that will replace it
            length += url_length
        return length

In [20]:
class Extractor:
    """
    Tweet parsing helper. The part reproduced here extracts the URLs contained
    in a piece of text together with their character positions.

    Source: https://github.com/glyph/twitter-text-py/blob/master/twitter_text/extractor.py
    """
    def __init__(self, text):
        self.text = text

    def extract_urls_with_indices(self, options=None):
        """
        Extract all URLs included in the text along with their indices.

        options -- dict of settings; the only key read is
                   'extract_url_without_protocol' (defaults to True when no
                   dict is given), which enables matching bare domains such
                   as ``example.com``.

        Returns a list of dicts ``{'url': str, 'indices': [start, end]}`` in
        order of appearance. If the text is None or empty, or contains no
        URLs, an empty list is returned.
        """
        # Build the default per call instead of sharing one mutable dict
        # between all calls through the signature.
        if options is None:
            options = {'extract_url_without_protocol': True}
        # Honour the documented contract for missing text instead of letting
        # the regex engine raise a TypeError on None.
        if not self.text:
            return []
        urls = []
        for match in REGEXEN['valid_url'].finditer(self.text):
            complete, before, url, protocol, domain, port, path, query = match.groups()
            before = before or ''
            # The match includes the character preceding the URL; skip it.
            start_position = match.start() + len(before)
            end_position = match.end()
            if protocol:
                # t.co links are cut at the end of their slug (stricter rule).
                tco_match = REGEXEN['valid_tco_url'].match(url)
                if tco_match:
                    url = tco_match.group()
                    end_position = start_position + len(url)
                urls.append({
                    'url':      url,
                    'indices':  [start_position, end_position]
                })
                continue
            # If protocol is missing and domain contains non-ASCII characters,
            # extract ASCII-only domains.
            if not options.get('extract_url_without_protocol') or REGEXEN['invalid_url_without_protocol_preceding_chars'].search(before):
                continue
            last_url = None
            last_url_invalid_match = None
            for ascii_match in REGEXEN['valid_ascii_domain'].finditer(domain):
                ascii_domain = ascii_match.group()
                # Position of this domain inside the text, located through its
                # first occurrence within the complete match.
                domain_start = match.start() + complete.find(ascii_domain)
                last_url = {
                    'url':      ascii_domain,
                    'indices':  [domain_start, domain_start + len(ascii_domain)]
                }
                last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None
                if not last_url_invalid_match:
                    urls.append(last_url)
            # no ASCII-only domain found. Skip the entire URL
            if last_url is None:
                continue
            if path:
                # The last domain owns the path: widen it to the full URL. When
                # it was already appended above, this updates that same dict.
                last_url['url'] = url.replace(domain, last_url['url'])
                last_url['indices'][1] = end_position
                # A short domain is only accepted when a path follows it.
                if last_url_invalid_match:
                    urls.append(last_url)
        return urls


In [17]:
# Source: https://github.com/glyph/twitter-text-py/blob/master/twitter_text/regex.py

import re
try:
    unichr
except NameError:
    unichr = chr
from functools import reduce

def regex_range(start, end = None):
    """
    Build a fragment for use inside a regex character class.

    start -- code point of the first (or only) character.
    end   -- code point of the last character; when omitted (or otherwise
             falsy) only the single character for ``start`` is returned.

    Returns ``u'<start>-<end>'`` for a range, else ``u'<start>'``.
    """
    first = unichr(start)
    if not end:
        return u'%s' % first
    return u'%s-%s' % (first, unichr(end))

# Registry of compiled patterns, keyed by name.
REGEXEN = {}
# The three strings below are regex source fragments (note the doubled
# backslashes); they are meant to be dropped into a character class.
PUNCTUATION_CHARS = u'!"#$%&\'()*+,-./:;<=>?@\\[\\]^_\\`{|}~'
SPACE_CHARS = u" \\t\\n\\x0B\\f\\r"
CTRL_CHARS = u"\\x00-\\x1F\\x7F"
# Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
# to access both the list of characters and a pattern suitable for use with re.split
#  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
# range() excludes its upper bound, hence the "+ 1" on both spans: without it
# U+000D and U+200A would be missing from the list.
UNICODE_SPACES = [
    unichr(code_point)
    for code_point in (
        list(range(0x0009, 0x000D + 1))    # White_Space # Cc   [5] <control-0009>..<control-000D>
        + [
            0x0020,                        # White_Space # Zs       SPACE
            0x0085,                        # White_Space # Cc       <control-0085>
            0x00A0,                        # White_Space # Zs       NO-BREAK SPACE
            0x1680,                        # White_Space # Zs       OGHAM SPACE MARK
            0x180E,                        # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
        ]
        + list(range(0x2000, 0x200A + 1))  # White_Space # Zs  [11] EN QUAD..HAIR SPACE
        + [
            0x2028,                        # White_Space # Zl       LINE SEPARATOR
            0x2029,                        # White_Space # Zp       PARAGRAPH SEPARATOR
            0x202F,                        # White_Space # Zs       NARROW NO-BREAK SPACE
            0x205F,                        # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
            0x3000,                        # White_Space # Zs       IDEOGRAPHIC SPACE
        ]
    )
]
# A character class matching any ONE of the spaces above. Compiling the bare
# concatenation would only match all of them in sequence.
REGEXEN['spaces'] = re.compile(u'[%s]' % u''.join(UNICODE_SPACES))
# Latin accented characters, as one string of character-class ranges.
# 0xd7 (multiplication sign, confusable with "x") and 0xf7 (division sign)
# are deliberately left out, which is why the first block is cut in three.
# One-element tuples stand for a single code point rather than a range.
LATIN_ACCENTS = u''.join(
    regex_range(*bounds)
    for bounds in (
        (0x00c0, 0x00d6),
        (0x00d8, 0x00f6),
        (0x00f8, 0x00ff),
        (0x0100, 0x024f),
        (0x0253, 0x0254),
        (0x0256, 0x0257),
        (0x0259,),
        (0x025b,),
        (0x0263,),
        (0x0268,),
        (0x026f,),
        (0x0272,),
        (0x0289,),
        (0x028b,),
        (0x02bb,),
        (0x0300, 0x036f),
        (0x1e00, 0x1eff),
    )
)
# Stored without surrounding brackets: users of this entry take its
# ``.pattern`` and wrap it in ``[...]`` themselves.
REGEXEN['latin_accents'] = re.compile(LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
# URL related hash regex collection
# Characters that must not directly precede a URL in addition to the ones
# spelled out in the pattern below: BOM (U+FFFE, U+FEFF), U+FFFF and the
# directional-change marks U+202A..U+202E, following the twitter-text
# reference implementation.
_INVALID_URL_PRECEDING_CHARS = u''.join(
    unichr(code_point)
    for code_point in (0xFFFE, 0xFEFF, 0xFFFF, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E)
)
# The "%s" slot has to be filled in: left as is, the class would literally
# exclude "%" (and "s") instead of the invalid characters.
REGEXEN['valid_url_preceding_chars'] = re.compile(u'(?:[^A-Z0-9@＠$#＃%s]|^)' % _INVALID_URL_PRECEDING_CHARS, re.IGNORECASE | re.UNICODE)
# A bare domain (no protocol) is rejected when it directly follows one of these.
REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(u'[-_.\\/]$')
# Anything that is not punctuation, whitespace or a control character.
DOMAIN_VALID_CHARS = u'[^%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, u''.join(UNICODE_SPACES))
# One dot-terminated label; subdomains may contain "_" or "-" inside,
# the domain name itself only "-".
REGEXEN['valid_subdomain'] = re.compile(u'(?:(?:%s(?:[_-]|%s)*)?%s\\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
REGEXEN['valid_domain_name'] = re.compile(u'(?:(?:%s(?:[-]|%s)*)?%s\\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
# Generic top-level domains. The trailing lookahead makes sure the TLD is not
# merely the beginning of a longer ASCII word.
# Fixed entry: "公益" (was garbled as "公>益").
REGEXEN['valid_gTLD'] = re.compile(u'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
# Country-code top-level domains, same lookahead as above.
# Fixed entries: the scrambled tail "中加坡|湾|新香港" is restored to the
# delegated names 中國, 台湾, 新加坡 and 香港.
REGEXEN['valid_ccTLD'] = re.compile(u'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
# Punycode-encoded (internationalised) label.
REGEXEN['valid_punycode'] = re.compile(u'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)

# A full domain: any number of subdomain labels, the domain-name label, then
# a generic TLD, a country-code TLD or a punycode label.
REGEXEN['valid_domain'] = re.compile(
    u'(?:%(subdomain)s*%(name)s(?:%(gtld)s|%(cctld)s|%(punycode)s))' % {
        'subdomain': REGEXEN['valid_subdomain'].pattern,
        'name': REGEXEN['valid_domain_name'].pattern,
        'gtld': REGEXEN['valid_gTLD'].pattern,
        'cctld': REGEXEN['valid_ccTLD'].pattern,
        'punycode': REGEXEN['valid_punycode'].pattern,
    },
    re.IGNORECASE | re.UNICODE,
)

# This is used in Extractor
# Domain whose labels consist of ASCII letters, digits, "-", "_" and Latin
# accented characters only.
REGEXEN['valid_ascii_domain'] = re.compile(
    u'(?:(?:[A-Za-z0-9\\-_]|[%(accents)s])+\\.)+(?:%(gtld)s|%(cctld)s|%(punycode)s)' % {
        'accents': REGEXEN['latin_accents'].pattern,
        'gtld': REGEXEN['valid_gTLD'].pattern,
        'cctld': REGEXEN['valid_ccTLD'].pattern,
        'punycode': REGEXEN['valid_punycode'].pattern,
    },
    re.IGNORECASE | re.UNICODE,
)

# This is used in Extractor for stricter t.co URL extraction
REGEXEN['valid_tco_url'] = re.compile(
    u'^https?:\\/\\/t\\.co\\/[a-z0-9]+',
    re.IGNORECASE | re.UNICODE,
)

# This is used in Extractor to filter out unwanted URLs.
# Matches a string that is exactly one domain-name label plus a ccTLD.
REGEXEN['invalid_short_domain'] = re.compile(
    u'\\A%(name)s%(cctld)s\\Z' % {
        'name': REGEXEN['valid_domain_name'].pattern,
        'cctld': REGEXEN['valid_ccTLD'].pattern,
    },
    re.IGNORECASE | re.UNICODE,
)

# A port is a plain run of digits.
REGEXEN['valid_port_number'] = re.compile(u'[0-9]+')

# Characters allowed anywhere inside a URL path.
REGEXEN['valid_general_url_path_chars'] = re.compile(u"[a-z0-9!\\*';:=\\+\\,\\.\\$\\/%%#\\[\\]\\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
# Allow URL paths to contain balanced parens
#  1. Used in Wikipedia URLs like /Primer_(film)
#  2. Used in IIS sessions like /S(dfd346)/
REGEXEN['valid_url_balanced_parens'] = re.compile(u'\\(%s+\\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
# Valid end-of-path characters (so /foo. does not gobble the period).
#   1. Allow =&# for empty URL parameters and other URL-join artifacts
REGEXEN['valid_url_path_ending_chars'] = re.compile(u'[a-z0-9=_#\\/\\+\\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
# Path = path chars, optionally interleaved with balanced parens, ending in a
# valid ending char; or path chars followed by a slash.
# There must be NO space between the parens group and the path chars that
# follow it: this pattern is compiled without re.VERBOSE, so a space in the
# template is matched literally and would pull the word after "(...) " into
# the URL.
REGEXEN['valid_url_path'] = re.compile(u'(?:(?:%s*(?:%s%s*)*%s)|(?:%s+\\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)

# Characters allowed inside / at the end of a query string.
REGEXEN['valid_url_query_chars'] = re.compile(u"[a-z0-9!?\\*'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~|@]", re.IGNORECASE | re.UNICODE)
REGEXEN['valid_url_query_ending_chars'] = re.compile(u'[a-z0-9_&=#\\/]', re.IGNORECASE | re.UNICODE)
# The complete URL matcher. Capture groups, in order:
#   1 complete match   2 preceding char   3 URL        4 protocol
#   5 domain           6 port             7 path       8 query string
REGEXEN['valid_url'] = re.compile(u'((%s)((https?:\\/\\/)?(%s)(?::(%s))?(/%s*)?(\\?%s*%s)?))' % (
    REGEXEN['valid_url_preceding_chars'].pattern,
    REGEXEN['valid_domain'].pattern,
    REGEXEN['valid_port_number'].pattern,
    REGEXEN['valid_url_path'].pattern,
    REGEXEN['valid_url_query_chars'].pattern,
    REGEXEN['valid_url_query_ending_chars'].pattern
), re.IGNORECASE | re.UNICODE)