In [96]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import ftfy
import pickle
from datetime import datetime

In [97]:
df = pd.read_csv("isw_reports.csv")

In [98]:
df

Unnamed: 0,date,report_text,url
0,2022-02-24,"Mason Clark, George Barros, and Kateryna Stepa...",https://www.understandingwar.org/backgrounder/...
1,2022-02-25,"Mason Clark, George Barros, and Kateryna Stepa...",https://www.understandingwar.org/backgrounder/...
2,2022-02-26,"Mason Clark, George Barros, and Katya Stepanen...",https://www.understandingwar.org/backgrounder/...
3,2022-02-27,"Mason Clark, George Barros, and Kateryna Stepa...",https://www.understandingwar.org/backgrounder/...
4,2022-02-28,"Mason Clark, George Barros, and Kateryna Stepa...",https://www.understandingwar.org/backgrounder/...
...,...,...,...
1087,2025-02-25,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...
1088,2025-02-26,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...
1089,2025-02-27,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...
1090,2025-02-28,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...


In [99]:
# 1. QUALITY CHECKS

In [100]:
df['date'] = pd.to_datetime(df['date'])

In [101]:
existing_dates = pd.Series(df['date'].unique())

In [102]:
start_range = datetime(2022, 2, 24)
end_range = datetime(2025, 3, 1)

In [103]:
full_range = pd.date_range(start=start_range, end=end_range, freq='D')

In [104]:
missing_dates = full_range.difference(existing_dates)

In [105]:
# the isw_reports table covers all but ten of the existing dates from our analysis period.
# a manual check confirms that reports for the following dates are not present on understandingwar.org
# these values will be handled during the table merge

missing_dates

DatetimeIndex(['2022-11-24', '2022-12-25', '2023-01-01', '2023-11-23',
               '2023-12-25', '2024-01-01', '2024-10-08', '2024-11-28',
               '2024-12-25', '2025-01-01'],
              dtype='datetime64[ns]', freq=None)

In [None]:
"""
DO NOT EXIST:
Russian Offensive Campaign Assessment, November 24, 2022
Russian Offensive Campaign Assessment, December 25, 2022
Russian Offensive Campaign Assessment, January 1, 2023
Russian Offensive Campaign Assessment, November 23, 2023
Russian Offensive Campaign Assessment, December 25, 2023
Russian Offensive Campaign Assessment, January 1, 2024
Russian Offensive Campaign Assessment, October 8, 2024
Russian Offensive Campaign Assessment, November 28, 2024
Russian Offensive Campaign Assessment, December 25, 2024
Russian Offensive Campaign Assessment, January 1, 2025
"""

In [106]:
# 2. VECTORISATION

In [74]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [107]:
lemmatizer = WordNetLemmatizer()

In [108]:
stop_words = set(stopwords.words('english'))
custom_stops = {
    # report metadata
    'isw', 'report', 'assessment', 'update', 'backgrounder', 'pm', 'est',
    'eet', 'local', 'time', 'et', 'key', 'takeaway', 'item', 'watch',
    'click', 'map', 'interactive', 'see', 'figure', 'source', 'url', 'http',
    'https', 'www', 'published', 'updated', 'accessed', 'twitter', 'telegram',
    'note', 'isws', 'daily', 'reference', 'statement', 'backgrounder',

    # generic time references
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december', 'monday', 'tuesday',
    'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'day', 'week',
    'month', 'year', 'hour', 'date', 'recent', 'recently', 'past', 'future',

    # generic verbs
    'include', 'including', 'also', 'may', 'provide', 'provides', 'provided',
    'providing', 'conduct', 'conducts', 'conducted', 'conducting',
    'continue', 'continues', 'continued', 'continuing', 'develop', 'develops',
    'developed', 'developing', 'indicate', 'indicates', 'indicated',
    'indicating', 'use', 'using', 'used', 'state', 'stated', 'claim', 'claimed',
    'assess', 'assessed',

    # generic nouns
    'area', 'effort', 'system', 'process', 'part', 'level', 'type', 'way',
    'situation', 'presence', 'resource', 'result', 'status', 'structure',
    'support', 'basis', 'center', 'change', 'condition', 'facility',
    'material', 'measure', 'member', 'number', 'order', 'percent',
    'security', 'series', 'service', 'term', 'people', 'city', 'region',
    'plan', 'objective', 'potential', 'capability', 'capacity',

    # generic connectives
    'however', 'unspecified', 'element', 'although', 'another', 'available',
    'following', 'former', 'main', 'need', 'public', 'publicly', 'still',
    'throughout', 'well', 'would', 'yet', 'ability', 'able', 'access',

    # authors
    'fredrick', 'kagan', 'george', 'barros', 'kateryna', 'katya',
    'stepanenko', 'karolina', 'hird', 'mason', 'clark', 'frederick',
    'grace', 'mappes', 'katherine', 'lawlor', 'frederick', 'layne',
    'philipson', 'angela', 'howard', 'riley', 'bailey', 'nicole',
    'wolkov', 'angelica', 'evans', 'christina', 'harward',
}
stop_words.update(custom_stops)

In [109]:
def preprocess_text(text):
    text = ftfy.fix_text(text)

    # author line patterns
    text = re.sub(r"Russian Offensive Campaign Assessment,.*?\d{1,2}:\d{2}\s*(?:am|pm)\s*ET", "", text, flags=re.IGNORECASE)
    # common map links
    text = re.sub(r"Click here to see ISW’s interactive map.*?\.", "", text, flags=re.IGNORECASE)
    # bracketed numbers
    text = re.sub(r'\[\d+\]', '', text)

    text = text.lower()

    # punctuation
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'\s-\s|\s-$|^-', ' ', text)

    # tokenization
    tokens = word_tokenize(text)

    # lemmatization
    processed_tokens = [
        lemmatizer.lemmatize(word) for word in tokens
        if word not in stop_words and len(word) > 2 and not word.startswith('-') and not word.endswith('-')
    ]

    return ' '.join(processed_tokens)

In [110]:
df['processed_text'] = df['report_text'].apply(preprocess_text)

In [111]:
df['processed_text']

0       russian president vladimir putin began large-s...
1       russian force entered major ukrainian citiesin...
2       russian force ax advance last hour focused kyi...
3       russian military likely recognized initial exp...
4       russian military reorganizing military effort ...
                              ...                        
1087    russian invasion ukraine alongside static map ...
1088    russian offensive campaign davit gasparyan rus...
1089    russian invasion ukraine alongside static map ...
1090    russian invasion ukraine alongside static map ...
1091    russian invasion ukraine alongside static map ...
Name: processed_text, Length: 1092, dtype: object

In [112]:
# optimal values based on the analysis of the resulting vectorised data.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.80,
    min_df=5,
    max_features=10000,
    sublinear_tf=True
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

In [113]:
# initially we tended to use fewer max_features and not use TruncatedSVD, but it caused memory problems
# when working with a merged dataset. to solve this problem we decided to use TruncatedSVD,
# and to cover the potential loss of information with the increased the number of max_features

n_components = 150
svd = TruncatedSVD(n_components=n_components) # random_state=1
tfidf_svd_matrix_dense = svd.fit_transform(tfidf_matrix)

In [114]:
with open('tfidf_vectorizer.pkl', 'wb') as f_tfidf:
    pickle.dump(tfidf_vectorizer, f_tfidf)

with open('svd_reducer.pkl', 'wb') as f_svd:
    pickle.dump(svd, f_svd)

In [115]:
svd_feature_names = [f'svd_comp_{i+1}' for i in range(n_components)]
df_tfidf_svd = pd.DataFrame(tfidf_svd_matrix_dense, columns=svd_feature_names, index=df.index)

In [116]:
df_tfidf_svd['date'] = df['date']
date_col = df_tfidf_svd.pop('date') # make the 'date' column appear first
df_tfidf_svd.insert(0, 'date', date_col)

In [117]:
df_tfidf_svd

Unnamed: 0,date,svd_comp_1,svd_comp_2,svd_comp_3,svd_comp_4,svd_comp_5,svd_comp_6,svd_comp_7,svd_comp_8,svd_comp_9,...,svd_comp_141,svd_comp_142,svd_comp_143,svd_comp_144,svd_comp_145,svd_comp_146,svd_comp_147,svd_comp_148,svd_comp_149,svd_comp_150
0,2022-02-24,0.231406,0.113453,0.127448,0.105655,0.128726,0.172093,-0.113485,-0.079844,-0.158733,...,-0.036413,0.026344,-0.005851,-0.027000,-0.005885,0.027045,-0.031285,0.051623,0.007152,0.016031
1,2022-02-25,0.236842,0.122241,0.135436,0.100210,0.141044,0.175319,-0.151957,-0.089340,-0.171017,...,0.000529,0.016313,-0.012013,-0.018154,-0.026495,0.022813,-0.045699,0.039631,-0.003964,0.005916
2,2022-02-26,0.252856,0.157072,0.168624,0.109284,0.152090,0.220771,-0.161296,-0.093684,-0.186725,...,0.004067,-0.025538,0.013086,-0.042489,0.011772,-0.003432,-0.018679,-0.020046,-0.003864,-0.006122
3,2022-02-27,0.250906,0.152892,0.178549,0.118152,0.157456,0.206573,-0.144489,-0.102788,-0.180502,...,0.029428,-0.009811,0.027759,-0.030727,-0.007179,-0.026978,0.012720,-0.041540,0.013072,-0.005067
4,2022-02-28,0.248396,0.147887,0.166173,0.111112,0.131580,0.198621,-0.111919,-0.079716,-0.175104,...,-0.007494,-0.051478,0.045730,0.009817,0.012191,-0.053523,0.011049,-0.015222,-0.034729,-0.014278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,2025-02-25,0.434699,-0.290698,0.137020,-0.134252,0.054225,-0.001801,-0.071009,0.194973,-0.013816,...,-0.007506,-0.024911,-0.029233,0.013226,0.013393,-0.026268,-0.004354,0.026317,0.012937,-0.016769
1088,2025-02-26,0.406357,-0.273944,0.157600,-0.161494,0.059594,0.004037,-0.081583,0.239852,-0.024772,...,-0.002012,-0.003368,-0.039393,0.005027,-0.001091,0.017131,0.040327,0.031117,-0.000931,-0.014690
1089,2025-02-27,0.407333,-0.311196,0.182906,-0.172292,0.098146,-0.002363,-0.102321,0.238852,-0.010877,...,0.030724,0.018139,-0.024595,-0.011180,0.005215,0.003520,0.021557,-0.008575,-0.032183,-0.014421
1090,2025-02-28,0.434414,-0.296324,0.159768,-0.159307,0.075228,-0.005174,-0.057242,0.169707,-0.010135,...,-0.001739,0.016067,-0.045708,0.016310,0.002560,-0.026677,-0.005022,-0.008501,-0.042639,-0.025572


In [118]:
df_tfidf_svd.to_csv('isw_reports_prepared.csv', index=False)