-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix parsing of Morgan Stanley statements (#20)
fixes #17
- Loading branch information
Showing
4 changed files
with
154 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
...mentParser/StatementParser/Parsers/Brokers/MorganStanley/PdfModels/StatementModel.2022.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
using ASoft.TextDeserializer.Attributes; | ||
|
||
namespace StatementParser.Parsers.Brokers.MorganStanley.PdfModels | ||
{ | ||
/// <summary>
/// Deserialization model for the 2022 layout of a Morgan Stanley statement.
/// The class-level pattern captures the text following "Issuer Description:"
/// (up to the next tab or newline) into <see cref="Name"/>.
/// </summary>
[DeserializeByRegex(@"Issuer Description:\t(?<Name>[^\t\n]+)")]
internal class StatementModel2022
{
    /// <summary>
    /// Value of the <c>Name</c> capture group from the class-level pattern.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// Transaction rows found in the statement.
    /// </summary>
    /// <remarks>
    /// The first pattern matches an M/D/YY style date and the second one spans the
    /// text between the transaction table header and the "Sell Transactions are
    /// provided as of trade date." footer. NOTE(review): presumably the first
    /// pattern marks where each item starts and the second limits the searched
    /// section - confirm against the ASoft.TextDeserializer documentation.
    /// </remarks>
    [DeserializeCollectionByRegex(
        @"([0-9]{1,2}/[0-9]{1,2}/[0-9]{2})",
        @"\nGross\nTransaction Date\tActivity Type\tQuantity\tPrice\tAmount\tTotal Taxes and Fees\tTotal Net Amount\n" +
        @"((?s).+?)" +
        @"\nSell Transactions are provided as of trade date\.")]
    public TransactionModel2022[] Transactions { get; set; }
}
} |
43 changes: 43 additions & 0 deletions
43
...ntParser/StatementParser/Parsers/Brokers/MorganStanley/PdfModels/TransactionModel.2022.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
using System; | ||
using System.Globalization; | ||
|
||
using ASoft.TextDeserializer.Attributes; | ||
|
||
namespace StatementParser.Parsers.Brokers.MorganStanley.PdfModels | ||
{ | ||
/// <summary>
/// Deserialization model for a single row of the transaction table in the 2022
/// layout of a Morgan Stanley statement.
/// </summary>
/// <remarks>
/// A row starts with an M/D/YY style date and one of the known activity types.
/// The quantity/price pair and the trailing amount columns are both optional in
/// the pattern, so any of the numeric properties may be left unset for a row.
/// </remarks>
[DeserializeByRegex(
    @"^(?<Date>[0-9]{1,2}/[0-9]{1,2}/[0-9]{2})" +
    @"\t(?<Type>Share Deposit|Dividend Credit|Withholding Tax|Dividend Reinvested|Sale|Proceeds Disbursement)" +
    @"(?:" +
        @"\t(?<Quantity>[0-9,]+\.[0-9]{3})" +
        @"\t\$?(?<Price>[0-9,]+\.[0-9]{4})" +
    @")?" +
    @"(?:" +
        @"(?:\t\$?(?<GrossAmount>[0-9,]+\.[0-9]{2}))?" +
        @"(?:\t\$?(?<TotalTaxesAndFees>[0-9,]+\.[0-9]{2}))?" +
        @"\t\$?(?<TotalNetAmountRaw>\(?[0-9,]+\.[0-9]{2})\)?" +
    @")?$")]
internal class TransactionModel2022
{
    // Cached, read-only en-US culture. Unlike "new CultureInfo(name)" it is not
    // affected by per-user number format overrides and is not re-allocated on
    // every property access.
    private static readonly CultureInfo UsCulture = CultureInfo.GetCultureInfo("en-US");

    /// <summary>Value of the <c>Date</c> capture group.</summary>
    public DateTime Date { get; set; }

    /// <summary>Activity type, one of the alternatives listed in the pattern.</summary>
    public string Type { get; set; }

    /// <summary>Value of the optional <c>Quantity</c> capture group.</summary>
    public decimal Quantity { get; set; }

    /// <summary>Value of the optional <c>Price</c> capture group.</summary>
    public decimal Price { get; set; }

    /// <summary>Value of the optional <c>GrossAmount</c> capture group.</summary>
    public decimal GrossAmount { get; set; }

    /// <summary>Value of the optional <c>TotalTaxesAndFees</c> capture group.</summary>
    public decimal TotalTaxesAndFees { get; set; }

    /// <summary>
    /// Net amount exactly as captured: digits with an optional leading '('.
    /// The closing ')' sits outside the capture group in the pattern.
    /// </summary>
    public string TotalNetAmountRaw { get; set; }

    /// <summary>
    /// <see cref="TotalNetAmountRaw"/> parsed as an en-US number, where a leading
    /// '(' is read as a minus sign. Returns 0 when no raw amount is available.
    /// </summary>
    public decimal TotalNetAmount
    {
        get
        {
            // Null already produced 0 via Convert.ToDecimal; an empty string is
            // treated the same way instead of throwing FormatException.
            if (string.IsNullOrEmpty(TotalNetAmountRaw))
            {
                return 0m;
            }

            // "(1,234.56" -> "-1,234.56"; TrimEnd is kept in case a ')' is present.
            var normalized = TotalNetAmountRaw.Replace('(', '-').TrimEnd(')');
            return Convert.ToDecimal(normalized, UsCulture);
        }
    }

    /// <summary>Returns all properties as a single "Name: value" line.</summary>
    public override string ToString()
    {
        return $"{nameof(Date)}: {Date} {nameof(Type)}: {Type} {nameof(Quantity)}: {Quantity} {nameof(Price)}: {Price} {nameof(GrossAmount)}: {GrossAmount} {nameof(TotalTaxesAndFees)}: {TotalTaxesAndFees} {nameof(TotalNetAmount)}: {TotalNetAmount}";
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,80 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using ASoft.TextDeserializer; | ||
using UglyToad.PdfPig; | ||
using UglyToad.PdfPig.Content; | ||
|
||
namespace StatementParser.Parsers | ||
{ | ||
/// <summary>
/// <see cref="ITextSource"/> implementation that reads text from a PDF file.
/// </summary>
internal class TextSource : ITextSource
{
    private readonly PdfDocument document;
    private readonly bool improved;

    /// <summary>
    /// Opens the PDF document at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to open.</param>
    /// <param name="improved">
    /// When <c>true</c>, page text is rebuilt word by word with '\n', '\t' and ' '
    /// separators; when <c>false</c>, the page's own <c>Text</c> is returned.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
    public TextSource(string filePath, bool improved = false)
    {
        _ = filePath ?? throw new ArgumentNullException(nameof(filePath));

        this.document = PdfDocument.Open(filePath);
        this.improved = improved;
    }

    /// <summary>Title taken from the document information.</summary>
    public string Title
        => this.document.Information.Title;

    /// <summary>
    /// Returns the text of every page, one string per page.
    /// </summary>
    /// <remarks>
    /// The result is a lazy LINQ projection over the document's pages, so it
    /// should be enumerated before <see cref="Dispose"/> is called.
    /// </remarks>
    public IEnumerable<string> GetPagesText()
    {
        var pages = this.document.GetPages();

        return this.improved
            ? pages.Select(page => GetPageText(page))
            : pages.Select(page => page.Text);
    }

    /// <summary>Disposes the underlying PDF document.</summary>
    public void Dispose()
    {
        this.document.Dispose();
    }

    /// <summary>
    /// Like <c>Page.Text</c>, but tries to retain more of the original
    /// document's structure by adding whitespace like '\n', '\t' and ' '.
    /// </summary>
    /// <param name="page">Page whose words are concatenated.</param>
    /// <returns>The page's words joined by the inferred separators.</returns>
    private static string GetPageText(Page page)
    {
        var sb = new StringBuilder();
        var lastBottom = 0.0;
        var lastRight = 0.0;

        foreach (var word in page.GetWords())
        {
            var box = word.BoundingBox;

            if (sb.Length > 0 && Math.Abs(box.Bottom - lastBottom) > box.Height / 2)
            {
                // The vertical position differs too much, separate to a new row
                sb.Append('\n');
            }
            else if (sb.Length > 0 && sb[^1] != '\n' && box.Left - lastRight > box.Height)
            {
                // The horizontal gap between the right edge of the previous word and
                // the left edge of this word is too big, separate to a new column
                sb.Append('\t');
            }

            lastBottom = box.Bottom;
            lastRight = box.Right;

            if (sb.Length > 0 && !char.IsWhiteSpace(sb[^1]))
            {
                // Neither a row nor a column break was emitted: keep the words apart
                sb.Append(' ');
            }

            sb.Append(word.Text);
        }

        return sb.ToString();
    }
}
} |