Skip to content

Commit

Permalink
Fix parsing of Morgan Stanley statements (#20)
Browse files Browse the repository at this point in the history
fixes #17
  • Loading branch information
yirkha authored Apr 23, 2024
1 parent f9ec139 commit a28a079
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 16 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using ASoft.TextDeserializer;
using ASoft.TextDeserializer.Exceptions;
using StatementParser.Models;
Expand All @@ -11,9 +12,9 @@ namespace StatementParser.Parsers.Brokers.MorganStanley
internal class MorganStanleyStatementPdfParser : ITransactionParser
{
private bool CanParse(string statementFilePath)
{
return File.Exists(statementFilePath) && Path.GetExtension(statementFilePath).ToLowerInvariant() == ".pdf";
}
{
return File.Exists(statementFilePath) && Path.GetExtension(statementFilePath).ToLowerInvariant() == ".pdf";
}

public IList<Transaction> Parse(string statementFilePath)
{
Expand All @@ -22,23 +23,26 @@ public IList<Transaction> Parse(string statementFilePath)
return null;
}

var transactions = new List<Transaction>();

using (var textSource = new TextSource(statementFilePath))
using var textSource = new TextSource(statementFilePath, true);
try
{
try
if (Regex.IsMatch(textSource.Title, @"^Morgan Stanley Smith Barney Document SP10 History Statements "))
{
var parsedDocument = new TextDocumentParser<StatementModel>().Parse(textSource);

transactions.AddRange(GetTransactions(parsedDocument));
return ParseLegacyStatement(textSource);
}
catch (TextException)
else if (Regex.IsMatch(textSource.Title, @"^Morgan Stanley Smith Barney Document EPS217CCC linux-TTF New$"))
{
return Parse2022Statement(textSource);
}
else
{
return null;
}
}

return transactions;
catch (TextException)
{
return null;
}
}

private decimal SearchForTax(StatementModel statementModel, TransactionModel transactionModel)
Expand All @@ -48,8 +52,34 @@ private decimal SearchForTax(StatementModel statementModel, TransactionModel tra
.Select(i => i.NetAmount).FirstOrDefault();
}

private IEnumerable<Transaction> GetTransactions(StatementModel statementModel)
/// <summary>
/// Deserializes the given text source into a <see cref="StatementModel"/> and converts
/// its rows into transactions: all "Share Deposit" rows first, followed by all
/// "Dividend Credit" rows.
/// </summary>
/// <param name="textSource">Text source of the statement to parse.</param>
/// <returns>The deposit transactions followed by the dividend transactions.</returns>
private IList<Transaction> ParseLegacyStatement(TextSource textSource)
{
    var model = new TextDocumentParser<StatementModel>().Parse(textSource);

    // Every "Share Deposit" row becomes a deposit of the statement's security.
    IEnumerable<Transaction> deposits = model.Transactions
        .Where(row => row.Type == "Share Deposit")
        .Select(row => new DepositTransaction(Broker.MorganStanley, row.Date, model.Name, row.Quantity, row.Price, Currency.USD));

    // Every "Dividend Credit" row becomes a dividend; its tax is looked up via SearchForTax.
    IEnumerable<Transaction> dividends = model.Transactions
        .Where(row => row.Type == "Dividend Credit")
        .Select(row => new DividendTransaction(Broker.MorganStanley, row.Date, model.Name, row.GrossAmount, SearchForTax(model, row), Currency.USD));

    return deposits.Concat(dividends).ToList();
}

/// <summary>
/// Looks up the total net amount of the first "Withholding Tax" row that has the same
/// date as <paramref name="transactionModel"/>.
/// </summary>
/// <param name="statementModel">Statement whose transactions are searched.</param>
/// <param name="transactionModel">Transaction whose date is used for the lookup.</param>
/// <returns>The matching row's <c>TotalNetAmount</c>, or 0 when no row matches.</returns>
private decimal SearchForTax(StatementModel2022 statementModel, TransactionModel2022 transactionModel)
{
    var matchingTaxAmounts =
        from candidate in statementModel.Transactions
        where candidate.Type == "Withholding Tax" && candidate.Date == transactionModel.Date
        select candidate.TotalNetAmount;

    // FirstOrDefault yields default(decimal), i.e. 0, when nothing matched.
    return matchingTaxAmounts.FirstOrDefault();
}

private IList<Transaction> Parse2022Statement(TextSource textSource)
{
var statementModel = new TextDocumentParser<StatementModel2022>().Parse(textSource);

var output = new List<Transaction>();

output.AddRange(statementModel.Transactions
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
using ASoft.TextDeserializer.Attributes;

namespace StatementParser.Parsers.Brokers.MorganStanley.PdfModels
{
/// <summary>
/// Deserialization model for a whole Morgan Stanley statement; the "2022" suffix
/// presumably refers to the statement layout introduced that year (TODO confirm).
/// The class-level pattern captures the text following "Issuer Description:" and a tab
/// (up to the next tab or newline) into the named group <c>Name</c>.
/// NOTE(review): mapping of named groups onto same-named properties is assumed to be
/// done by ASoft.TextDeserializer - confirm against the library.
/// </summary>
[DeserializeByRegex(@"Issuer Description:\t(?<Name>[^\t\n]+)")]
internal class StatementModel2022
{
/// <summary>Issuer description text captured by the <c>Name</c> group of the class-level pattern.</summary>
public string Name { get; set; }

/// <summary>
/// Rows of the transaction table. The second pattern selects, in single-line mode, the
/// text between the table header row and the "Sell Transactions are provided as of
/// trade date." footer; the first pattern matches a M/D/YY-style date.
/// NOTE(review): the first pattern presumably marks where each row begins - confirm
/// the argument semantics of DeserializeCollectionByRegex.
/// The patterns expect columns separated by '\t' and rows separated by '\n'.
/// </summary>
[DeserializeCollectionByRegex(
@"([0-9]{1,2}/[0-9]{1,2}/[0-9]{2})",
@"\nGross\nTransaction Date\tActivity Type\tQuantity\tPrice\tAmount\tTotal Taxes and Fees\tTotal Net Amount\n((?s).+?)\nSell Transactions are provided as of trade date\.")]
public TransactionModel2022[] Transactions { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using System;
using System.Globalization;

using ASoft.TextDeserializer.Attributes;

namespace StatementParser.Parsers.Brokers.MorganStanley.PdfModels
{
/// <summary>
/// Deserialization model for one row of the transaction table. The pattern expects
/// tab-separated columns: a M/D/YY-style date, an activity type, an optional
/// quantity/price pair and an optional group of amounts (gross, taxes and fees, net).
/// NOTE(review): mapping of named groups onto same-named properties is assumed to be
/// done by ASoft.TextDeserializer - confirm against the library.
/// </summary>
[DeserializeByRegex(
    @"^(?<Date>[0-9]{1,2}/[0-9]{1,2}/[0-9]{2})" +
    @"\t(?<Type>Share Deposit|Dividend Credit|Withholding Tax|Dividend Reinvested|Sale|Proceeds Disbursement)" +
    @"(?:" +
        @"\t(?<Quantity>[0-9,]+\.[0-9]{3})" +
        @"\t\$?(?<Price>[0-9,]+\.[0-9]{4})" +
    @")?" +
    @"(?:" +
        @"(?:\t\$?(?<GrossAmount>[0-9,]+\.[0-9]{2}))?" +
        @"(?:\t\$?(?<TotalTaxesAndFees>[0-9,]+\.[0-9]{2}))?" +
        @"\t\$?(?<TotalNetAmountRaw>\(?[0-9,]+\.[0-9]{2})\)?" +
    @")?$")]
internal class TransactionModel2022
{
    /// <summary>Value of the <c>Date</c> group (first column of the row).</summary>
    public DateTime Date { get; set; }

    /// <summary>Activity type; one of the alternatives listed in the <c>Type</c> group.</summary>
    public string Type { get; set; }

    /// <summary>Value of the optional <c>Quantity</c> group (three decimal places).</summary>
    public decimal Quantity { get; set; }

    /// <summary>Value of the optional <c>Price</c> group (four decimal places).</summary>
    public decimal Price { get; set; }

    /// <summary>Value of the optional <c>GrossAmount</c> group (two decimal places).</summary>
    public decimal GrossAmount { get; set; }

    /// <summary>Value of the optional <c>TotalTaxesAndFees</c> group (two decimal places).</summary>
    public decimal TotalTaxesAndFees { get; set; }

    /// <summary>
    /// Raw text of the net amount column. The pattern keeps an opening parenthesis inside
    /// the capture (the closing one is matched outside of it), so a parenthesized amount
    /// arrives here as e.g. "(12.34".
    /// </summary>
    public string TotalNetAmountRaw { get; set; }

    /// <summary>
    /// <see cref="TotalNetAmountRaw"/> parsed as an en-US decimal, with a leading '('
    /// interpreted as a minus sign. Returns 0 when the raw value is null, empty or
    /// whitespace, i.e. when the optional amount group did not take part in the match.
    /// </summary>
    /// <exception cref="FormatException">The raw value is not a valid number.</exception>
    public decimal TotalNetAmount
    {
        get
        {
            // Convert.ToDecimal maps null to 0 but throws on "", so rows without the
            // amount columns must be handled here to keep ToString() usable.
            if (string.IsNullOrWhiteSpace(TotalNetAmountRaw))
            {
                return 0m;
            }

            // "(12.34" -> "-12.34"; TrimEnd also tolerates a raw value that still
            // carries the closing parenthesis.
            var normalized = TotalNetAmountRaw.Replace('(', '-').TrimEnd(')');

            // GetCultureInfo returns a cached, read-only culture without user overrides,
            // so parsing does not depend on the machine's regional settings.
            return Convert.ToDecimal(normalized, CultureInfo.GetCultureInfo("en-US"));
        }
    }

    /// <summary>Returns all parsed values in a single line, intended for diagnostics.</summary>
    public override string ToString()
    {
        return $"{nameof(Date)}: {Date} {nameof(Type)}: {Type} {nameof(Quantity)}: {Quantity} {nameof(Price)}: {Price} {nameof(GrossAmount)}: {GrossAmount} {nameof(TotalTaxesAndFees)}: {TotalTaxesAndFees} {nameof(TotalNetAmount)}: {TotalNetAmount}";
    }
}
}
54 changes: 52 additions & 2 deletions StatementParser/StatementParser/Parsers/TextSource.cs
Original file line number Diff line number Diff line change
@@ -1,30 +1,80 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using ASoft.TextDeserializer;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;

namespace StatementParser.Parsers
{
internal class TextSource : ITextSource
{
private readonly PdfDocument document;
private readonly bool improved;

public TextSource(string filePath)
public TextSource(string filePath, bool improved = false)
{
_ = filePath ?? throw new ArgumentNullException(nameof(filePath));

this.document = PdfDocument.Open(filePath);
this.improved = improved;
}

/// <summary>
/// Title taken from the PDF document's information metadata, or an empty string when
/// the document carries no title. Never returns null, so the value can be passed
/// directly to APIs that reject null input (e.g. <c>Regex.IsMatch</c>).
/// </summary>
public string Title
    => this.document.Information?.Title ?? string.Empty;

public IEnumerable<string> GetPagesText()
{
return this.document.GetPages().Select(i => i.Text);
if (!improved)
{
return this.document.GetPages().Select(i => i.Text);
}
else
{
return this.document.GetPages().Select(i => GetPageText(i));
}
}

/// <summary>Disposes the underlying PDF document.</summary>
public void Dispose() => this.document.Dispose();

/// <summary>
/// Builds the page text word by word, like <c>Page.Text</c>, but inserts '\n', '\t'
/// and ' ' between words so that more of the page's row/column layout survives.
/// </summary>
/// <param name="page">Page whose words are concatenated.</param>
/// <returns>The words of the page joined by the inferred whitespace.</returns>
private string GetPageText(Page page)
{
    var text = new StringBuilder();
    double previousBottom = 0.0;
    double previousRight = 0.0;

    foreach (var word in page.GetWords())
    {
        var box = word.BoundingBox;
        bool hasText = text.Length > 0;

        if (hasText)
        {
            // More than half a word height of vertical offset means a new row.
            bool startsNewRow = Math.Abs(box.Bottom - previousBottom) > box.Height / 2;

            if (startsNewRow)
            {
                text.Append('\n');
            }
            else
            {
                // A horizontal gap wider than the word height means a new column.
                bool startsNewColumn = text[^1] != '\n' && box.Left - previousRight > box.Height;

                if (startsNewColumn)
                {
                    text.Append('\t');
                }
            }
        }

        previousBottom = box.Bottom;
        previousRight = box.Right;

        // Words that were not separated by a row/column break still need a space.
        if (hasText && !char.IsWhiteSpace(text[^1]))
        {
            text.Append(' ');
        }

        text.Append(word.Text);
    }

    return text.ToString();
}
}
}

0 comments on commit a28a079

Please sign in to comment.